diff --git a/.gitignore b/.gitignore
index b5306b8b79c37166e5496cf17a3e39b86b9a6314..5afe375f46f07b3b557ae23f75740b337517d3bd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,7 @@ Podfile.lock
 /tensorflow/contrib/lite/examples/ios/simple/data/*.tflite
 xcuserdata/**
 /api_init_files_list.txt
+/estimator_api_init_files_list.txt
 
 # Android
 .gradle
diff --git a/RELEASE.md b/RELEASE.md
index 7bb1e3e1c8bd2b0471a8259cb2354d5f4f4b777a..6b67072f8ecafa08c747f8296c7c2a59eb2350fa 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -34,18 +34,22 @@
   * Using `tf.layers` in  a subclassed `tf.keras.Model` class. See
     [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/layers) for more details
 * `tf.data`:
-  * The `DatasetBase::DebugString()` method is now `const`.
-  * Added the `tf.contrib.data.sample_from_datasets()` API for randomly sampling from multiple datasets.
+  * `Dataset.from_generator()` now accepts an `args` list, in order to create nested generators.
+  * `Dataset.list_files()` now produces determinstic results when `shuffle=False` or a `seed` is passed.
+  * `tf.contrib.data.sample_from_datasets()` and `tf.contrib.data.choose_from_datasets()` make it easier to sample or deterministically choose elements from multiple datasets.
+  * `tf.contrib.data.make_csv_dataset()` now supports line breaks in quoted strings, and two infrequently used arguments removed.
+  * (C++) `DatasetBase::DebugString()` is now `const`.
+  * (C++) `DatasetBase::MakeIterator()` has been renamed to `DatasetBase::MakeIteratorInternal()`.
+  * (C++) `IteratorBase::Initialize()` method was added to support raising errors during iterator construction.
 * Eager Execution:
+  * Added the ability to pause recording operations for gradient computation via `tf.GradientTape.stop_recording`.
+  * Updated documentation, introductory notebooks.
 * `tf.keras`:
   * Move Keras code out of _impl folder and remove API files.
   * `tf.keras.Model.save_weights` now saves in TensorFlow format by default.
   * Enable dataset iterators to be passed to `tf.keras.Model` training/eval methods.
-* Accelerated Linear Algebra (XLA):
-* TensorFlow Debugger (tfdbg): fix an issue in which the TensorBoard Debugger Plugin could not handle total source file size exceeding gRPC message size limit (4 MB).
+* TensorFlow Debugger (tfdbg) CLI: fix an issue in which the TensorBoard Debugger Plugin could not handle total source file size exceeding gRPC message size limit (4 MB).
 * `tf.contrib`:
-  * Add `tf.contrib.data.choose_from_datasets()`.
-  * `tf.contrib.data.make_csv_dataset()` now supports line breaks in quoted strings. Two arguments were removed from `make_csv_dataset`.
   * `tf.contrib.framework.zero_initializer` supports ResourceVariable.
   * Adding "constrained_optimization" to tensorflow/contrib.
 * Other:
@@ -55,7 +59,6 @@
   * More consistent GcsFileSystem behavior for certain reads past EOF.
   * Update benchmark for tf.scan to match ranges across eager and graph modes.
   * Fixed bug in `tf.reduce_prod gradient` for complex dtypes.
-  * Add optional `args` argument to `Dataset.from_generator()`.
   * Allow the use of '.' in variables (e.g. "hparams.parse('a.b=1.0')"), which would previously raise an error. This will correspond to an attribute name with an embedded '.' symbol (e.g. 'a.b'), which can only be accessed indirectly (e.g. through getattr and setattr).  To set this up the user will first need to explicitly add the variable to the hparam object (e.g. "hparams.add_hparam(name='a.b', value=0.0)").
   * Benchmark for tf.scan in graph and eager modes.
   * Added complex128 support to FFT, FFT2D, FFT3D, IFFT, IFFT2D, and IFFT3D.
@@ -65,7 +68,6 @@
   * LinearOperator[1D,2D,3D]Circulant added to `tensorflow.linalg`.
   * Conv3D, Conv3DBackpropInput, Conv3DBackpropFilter now supports arbitrary.
   * Added `tf.train.Checkpoint` for reading/writing object-based checkpoints.
-  * `Dataset.list_files()` now produces determinstic results when `shuffle=False` or a `seed` is passed.
   * Added LinearOperatorKronecker, a dense-free implementation of the Kronecker Product.
   * Allow LinearOperator to broadcast.
   * SavedModelBuilder will now deduplicate asset names that point to files with the same basename and the same contents. Note that this may result in new asset files included in SavedModels in cases where assets with the same name but different contents were previously overwriting each other.
diff --git a/WORKSPACE b/WORKSPACE
index fd7570a80ae2ee0087f7d2fd771fcce5b9690028..17961829a605c2d1f2d2ba86a7c30c47618c139b 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -18,7 +18,7 @@ closure_repositories()
 # files, in case the parsing of those build files depends on the bazel
 # version we require here.
 load("//tensorflow:version_check.bzl", "check_bazel_version_at_least")
-check_bazel_version_at_least("0.10.0")
+check_bazel_version_at_least("0.15.0")
 
 load("//tensorflow:workspace.bzl", "tf_workspace")
 
diff --git a/configure.py b/configure.py
index c482628ec88ed39c5cd31f2439215af524863602..f97bf8a66836a6647ba6aca625cb1526e11b39af 100644
--- a/configure.py
+++ b/configure.py
@@ -882,7 +882,7 @@ def set_tf_cudnn_version(environ_cp):
     default_cudnn_path = environ_cp.get('CUDA_TOOLKIT_PATH')
     ask_cudnn_path = (r'Please specify the location where cuDNN %s library is '
                       'installed. Refer to README.md for more details. [Default'
-                      ' is %s]:') % (tf_cudnn_version, default_cudnn_path)
+                      ' is %s]: ') % (tf_cudnn_version, default_cudnn_path)
     cudnn_install_path = get_from_env_or_user_or_default(
         environ_cp, 'CUDNN_INSTALL_PATH', ask_cudnn_path, default_cudnn_path)
 
@@ -1201,7 +1201,7 @@ def set_tf_cuda_compute_capabilities(environ_cp):
         'https://developer.nvidia.com/cuda-gpus.\nPlease'
         ' note that each additional compute '
         'capability significantly increases your '
-        'build time and binary size. [Default is: %s]' %
+        'build time and binary size. [Default is: %s]: ' %
         default_cuda_compute_capabilities)
     tf_cuda_compute_capabilities = get_from_env_or_user_or_default(
         environ_cp, 'TF_CUDA_COMPUTE_CAPABILITIES',
@@ -1402,14 +1402,36 @@ def set_build_strip_flag():
   write_to_bazelrc('build --strip=always')
 
 
-def set_windows_build_flags():
-  if is_windows():
-    # The non-monolithic build is not supported yet
-    write_to_bazelrc('build --config monolithic')
-    # Suppress warning messages
-    write_to_bazelrc('build --copt=-w --host_copt=-w')
-    # Output more verbose information when something goes wrong
-    write_to_bazelrc('build --verbose_failures')
+def set_windows_build_flags(environ_cp):
+  """Set Windows specific build options."""
+  # The non-monolithic build is not supported yet
+  write_to_bazelrc('build --config monolithic')
+  # Suppress warning messages
+  write_to_bazelrc('build --copt=-w --host_copt=-w')
+  # Output more verbose information when something goes wrong
+  write_to_bazelrc('build --verbose_failures')
+  # The host and target platforms are the same in Windows build. So we don't
+  # have to distinct them. This avoids building the same targets twice.
+  write_to_bazelrc('build --distinct_host_configuration=false')
+  # Enable short object file path to avoid long path issue on Windows.
+  # TODO(pcloudy): Remove this flag when upgrading Bazel to 0.16.0
+  # Short object file path will be enabled by default.
+  write_to_bazelrc('build --experimental_shortened_obj_file_path=true')
+
+  if get_var(
+      environ_cp, 'TF_OVERRIDE_EIGEN_STRONG_INLINE', 'Eigen strong inline',
+      True,
+      ('Would you like to override eigen strong inline for some C++ '
+       'compilation to reduce the compilation time?'),
+      'Eigen strong inline overridden.',
+      'Not overriding eigen strong inline, '
+      'some compilations could take more than 20 mins.'):
+    # Due to a known MSVC compiler issue
+    # https://github.com/tensorflow/tensorflow/issues/10521
+    # Overriding eigen strong inline speeds up the compiling of
+    # conv_grad_ops_3d.cc and conv_ops_3d.cc by 20 minutes,
+    # but this also hurts the performance. Let users decide what they want.
+    write_to_bazelrc('build --define=override_eigen_strong_inline=true')
 
 
 def config_info_line(name, help_text):
@@ -1429,7 +1451,7 @@ def main():
   # environment variables.
   environ_cp = dict(os.environ)
 
-  check_bazel_version('0.10.0')
+  check_bazel_version('0.15.0')
 
   reset_tf_configure_bazelrc(args.workspace)
   cleanup_makefile()
@@ -1537,7 +1559,8 @@ def main():
   set_grpc_build_flags()
   set_cc_opt_flags(environ_cp)
   set_build_strip_flag()
-  set_windows_build_flags()
+  if is_windows():
+    set_windows_build_flags(environ_cp)
 
   if get_var(
       environ_cp, 'TF_SET_ANDROID_WORKSPACE', 'android workspace',
@@ -1549,11 +1572,15 @@ def main():
     create_android_ndk_rule(environ_cp)
     create_android_sdk_rule(environ_cp)
 
-  print('Preconfigured Bazel build configs. You can use any of the below by '
-        'adding "--config=<>" to your build command. See tools/bazel.rc for '
-        'more details.')
-  config_info_line('mkl', 'Build with MKL support.')
-  config_info_line('monolithic', 'Config for mostly static monolithic build.')
+  # On Windows, we don't have MKL support and the build is always monolithic.
+  # So no need to print the following message.
+  # TODO(pcloudy): remove the following if check when they make sense on Windows
+  if not is_windows():
+    print('Preconfigured Bazel build configs. You can use any of the below by '
+          'adding "--config=<>" to your build command. See tools/bazel.rc for '
+          'more details.')
+    config_info_line('mkl', 'Build with MKL support.')
+    config_info_line('monolithic', 'Config for mostly static monolithic build.')
 
 if __name__ == '__main__':
   main()
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 518c2b0489021815c0480acb35a58717d6ca9359..60db234c9c56fcca32418fcc3b10385f8d82bd45 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -24,6 +24,14 @@ load(
     "gen_api_init_files",  # @unused
 )
 
+# Config setting used when building for products
+# which requires restricted licenses to be avoided.
+config_setting(
+    name = "no_lgpl_deps",
+    values = {"define": "__TENSORFLOW_NO_LGPL_DEPS__=1"},
+    visibility = ["//visibility:public"],
+)
+
 # Config setting for determining if we are building for Android.
 config_setting(
     name = "android",
@@ -373,6 +381,14 @@ config_setting(
     },
 )
 
+# Setting to use when loading kernels dynamically
+config_setting(
+    name = "dynamic_loaded_kernels",
+    define_values = {
+        "dynamic_loaded_kernels": "true",
+    },
+)
+
 config_setting(
     name = "using_cuda_nvcc",
     define_values = {
@@ -400,14 +416,6 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
-# TODO(laigd): consider removing this option and make TensorRT enabled
-# automatically when CUDA is enabled.
-config_setting(
-    name = "with_tensorrt_support",
-    values = {"define": "with_tensorrt_support=true"},
-    visibility = ["//visibility:public"],
-)
-
 package_group(
     name = "internal",
     packages = [
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index 5c218d3f25e01f0e78916d4a5a8b1d2751f9dc25..10bc8cdbee5a9df6d2084c10adab4ed6e5e6f0d3 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eval_const_tensor.h"
 #include "tensorflow/core/common_runtime/shape_refiner.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -327,6 +328,7 @@ TF_Buffer* TF_NewBufferFromString(const void* proto, size_t proto_len) {
 }
 
 void TF_DeleteBuffer(TF_Buffer* buffer) {
+  if (buffer == nullptr) return;
   if (buffer->data_deallocator != nullptr) {
     (*buffer->data_deallocator)(const_cast<void*>(buffer->data),
                                 buffer->length);
@@ -356,6 +358,7 @@ void TF_CloseDeprecatedSession(TF_DeprecatedSession* s, TF_Status* status) {
 
 void TF_DeleteDeprecatedSession(TF_DeprecatedSession* s, TF_Status* status) {
   status->status = Status::OK();
+  if (s == nullptr) return;
   delete s->session;
   delete s;
 }
@@ -906,6 +909,7 @@ TF_Library* TF_LoadLibrary(const char* library_filename, TF_Status* status) {
 TF_Buffer TF_GetOpList(TF_Library* lib_handle) { return lib_handle->op_list; }
 
 void TF_DeleteLibraryHandle(TF_Library* lib_handle) {
+  if (lib_handle == nullptr) return;
   tensorflow::port::Free(const_cast<void*>(lib_handle->op_list.data));
   delete lib_handle;
 }
@@ -963,6 +967,7 @@ TF_DEVICELIST_METHOD(const char*, TF_DeviceListName, name().c_str(), nullptr);
 TF_DEVICELIST_METHOD(const char*, TF_DeviceListType, device_type().c_str(),
                      nullptr);
 TF_DEVICELIST_METHOD(int64_t, TF_DeviceListMemoryBytes, memory_limit(), -1);
+TF_DEVICELIST_METHOD(uint64_t, TF_DeviceListIncarnation, incarnation(), 0);
 
 #undef TF_DEVICELIST_METHOD
 
@@ -1852,6 +1857,7 @@ TF_Graph::TF_Graph()
 TF_Graph* TF_NewGraph() { return new TF_Graph; }
 
 void TF_DeleteGraph(TF_Graph* g) {
+  if (g == nullptr) return;
   g->mu.lock();
   g->delete_requested = true;
   const bool del = g->sessions.empty();
@@ -2527,6 +2533,7 @@ void TF_CloseSession(TF_Session* s, TF_Status* status) {
 
 void TF_DeleteSession(TF_Session* s, TF_Status* status) {
   status->status = Status::OK();
+  if (s == nullptr) return;
   TF_Graph* const graph = s->graph;
   if (graph != nullptr) {
     graph->mu.lock();
@@ -2725,7 +2732,34 @@ TF_Buffer* TF_ApiDefMapGet(TF_ApiDefMap* api_def_map, const char* name,
 
   TF_Buffer* ret = TF_NewBuffer();
   status->status = MessageToBuffer(*api_def, ret);
+  if (!status->status.ok()) {
+    TF_DeleteBuffer(ret);
+    return nullptr;
+  }
   return ret;
 #endif  // __ANDROID__
 }
+
+TF_Buffer* TF_GetAllRegisteredKernels(TF_Status* status) {
+  tensorflow::KernelList kernel_list = tensorflow::GetAllRegisteredKernels();
+  TF_Buffer* ret = TF_NewBuffer();
+  status->status = MessageToBuffer(kernel_list, ret);
+  if (!status->status.ok()) {
+    TF_DeleteBuffer(ret);
+    return nullptr;
+  }
+  return ret;
+}
+
+TF_Buffer* TF_GetRegisteredKernelsForOp(const char* name, TF_Status* status) {
+  tensorflow::KernelList kernel_list =
+      tensorflow::GetRegisteredKernelsForOp(name);
+  TF_Buffer* ret = TF_NewBuffer();
+  status->status = MessageToBuffer(kernel_list, ret);
+  if (!status->status.ok()) {
+    TF_DeleteBuffer(ret);
+    return nullptr;
+  }
+  return ret;
+}
 }  // end extern "C"
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index 1eb75ef11ff337dfcb2e016e09804fc04662fcda..c8ae6f2dd1780c4fe50ff1924be8d2e9a7502cf0 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -44,6 +44,7 @@ limitations under the License.
 // * size_t is used to represent byte sizes of objects that are
 //   materialized in the address space of the calling process.
 // * int is used as an index into arrays.
+// * Deletion functions are safe to call on nullptr.
 //
 // Questions left to address:
 // * Might at some point need a way for callers to provide their own Env.
@@ -1521,6 +1522,13 @@ TF_CAPI_EXPORT extern const char* TF_DeviceListType(const TF_DeviceList* list,
 TF_CAPI_EXPORT extern int64_t TF_DeviceListMemoryBytes(
     const TF_DeviceList* list, int index, TF_Status* status);
 
+// Retrieve the incarnation number of a given device.
+//
+// If index is out of bounds, an error code will be set in the status object,
+// and 0 will be returned.
+TF_CAPI_EXPORT extern uint64_t TF_DeviceListIncarnation(
+    const TF_DeviceList* list, int index, TF_Status* status);
+
 // --------------------------------------------------------------------------
 // Load plugins containing custom ops and kernels
 
@@ -1603,6 +1611,18 @@ TF_CAPI_EXPORT extern TF_Buffer* TF_ApiDefMapGet(TF_ApiDefMap* api_def_map,
                                                  size_t name_len,
                                                  TF_Status* status);
 
+// --------------------------------------------------------------------------
+// Kernel definition information.
+
+// Returns a serialized KernelList protocol buffer containing KernelDefs for all
+// registered kernels.
+TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllRegisteredKernels(TF_Status* status);
+
+// Returns a serialized KernelList protocol buffer containing KernelDefs for all
+// kernels registered for the operation named `name`.
+TF_CAPI_EXPORT extern TF_Buffer* TF_GetRegisteredKernelsForOp(
+    const char* name, TF_Status* status);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc
index 610274696f5940c063e68f2310cfd9cc1e0bd964..f7ca219c896b2a7c07fc4d0739c70f2666652672 100644
--- a/tensorflow/c/c_api_function_test.cc
+++ b/tensorflow/c/c_api_function_test.cc
@@ -1516,7 +1516,8 @@ void DefineStatefulFunction(const char* name, TF_Function** func) {
 
   TF_Output inputs[] = {};
   TF_Output outputs[] = {{random, 0}};
-  *func = TF_GraphToFunction(func_graph.get(), name, /*append_hash=*/false, -1,
+  *func = TF_GraphToFunction(func_graph.get(), name,
+                             /*append_hash_to_fn_name=*/false, -1,
                              /*opers=*/nullptr, 0, inputs, 1, outputs,
                              /*output_names=*/nullptr,
                              /*opts=*/nullptr, "", s.get());
diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc
index bc04b53fbb7fa9ba46228ae5a4ec8ee96df5f3dc..e674b1623cf540eb8024d9be5ed8d77aa2fe17ba 100644
--- a/tensorflow/c/c_api_test.cc
+++ b/tensorflow/c/c_api_test.cc
@@ -29,9 +29,11 @@ limitations under the License.
 #include "tensorflow/core/framework/api_def.pb.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/node_def.pb_text.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
@@ -1424,6 +1426,29 @@ TEST(CAPI, SavedModelNullArgsAreValid) {
   TF_DeleteStatus(s);
 }
 
+TEST(CAPI, DeletingNullPointerIsSafe) {
+  TF_Status* status = TF_NewStatus();
+
+  TF_DeleteStatus(nullptr);
+  TF_DeleteBuffer(nullptr);
+  TF_DeleteTensor(nullptr);
+  TF_DeleteSessionOptions(nullptr);
+  TF_DeleteGraph(nullptr);
+  TF_DeleteImportGraphDefOptions(nullptr);
+  TF_DeleteImportGraphDefResults(nullptr);
+  TF_DeleteFunction(nullptr);
+  TF_DeleteSession(nullptr, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeletePRunHandle(nullptr);
+  TF_DeleteDeprecatedSession(nullptr, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteDeviceList(nullptr);
+  TF_DeleteLibraryHandle(nullptr);
+  TF_DeleteApiDefMap(nullptr);
+
+  TF_DeleteStatus(status);
+}
+
 REGISTER_OP("TestOpWithNoGradient")
     .Input("x: T")
     .Output("y: T")
@@ -2312,6 +2337,57 @@ TEST(TestApiDef, TestCreateApiDefWithOverwrites) {
   TF_DeleteLibraryHandle(lib);
 }
 
+class DummyKernel : public tensorflow::OpKernel {
+ public:
+  explicit DummyKernel(tensorflow::OpKernelConstruction* context)
+      : OpKernel(context) {}
+  void Compute(tensorflow::OpKernelContext* context) override {}
+};
+
+// Test we can query kernels
+REGISTER_OP("TestOpWithSingleKernel")
+    .Input("a: float")
+    .Input("b: float")
+    .Output("o: float");
+REGISTER_KERNEL_BUILDER(
+    Name("TestOpWithSingleKernel").Device(tensorflow::DEVICE_CPU), DummyKernel);
+
+TEST(TestKernel, TestGetAllRegisteredKernels) {
+  TF_Status* status = TF_NewStatus();
+  TF_Buffer* kernel_list_buf = TF_GetAllRegisteredKernels(status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  KernelList kernel_list;
+  kernel_list.ParseFromArray(kernel_list_buf->data, kernel_list_buf->length);
+  ASSERT_GT(kernel_list.kernel_size(), 0);
+  TF_DeleteBuffer(kernel_list_buf);
+  TF_DeleteStatus(status);
+}
+
+TEST(TestKernel, TestGetRegisteredKernelsForOp) {
+  TF_Status* status = TF_NewStatus();
+  TF_Buffer* kernel_list_buf =
+      TF_GetRegisteredKernelsForOp("TestOpWithSingleKernel", status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  KernelList kernel_list;
+  kernel_list.ParseFromArray(kernel_list_buf->data, kernel_list_buf->length);
+  ASSERT_EQ(kernel_list.kernel_size(), 1);
+  EXPECT_EQ(kernel_list.kernel(0).op(), "TestOpWithSingleKernel");
+  EXPECT_EQ(kernel_list.kernel(0).device_type(), "CPU");
+  TF_DeleteBuffer(kernel_list_buf);
+  TF_DeleteStatus(status);
+}
+
+TEST(TestKernel, TestGetRegisteredKernelsForOpNoKernels) {
+  TF_Status* status = TF_NewStatus();
+  TF_Buffer* kernel_list_buf = TF_GetRegisteredKernelsForOp("Unknown", status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  KernelList kernel_list;
+  kernel_list.ParseFromArray(kernel_list_buf->data, kernel_list_buf->length);
+  ASSERT_EQ(kernel_list.kernel_size(), 0);
+  TF_DeleteBuffer(kernel_list_buf);
+  TF_DeleteStatus(status);
+}
+
 #undef EXPECT_TF_META
 
 }  // namespace
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 6c510536d6f2a586b91baf96fa41b779db2c8d35..7321b4b791ffa722e9d3c7722c43297b0eae1eab 100644
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -288,7 +288,7 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) {
                          opts->async, std::move(device_mgr), r);
 }
 
-void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status) { delete ctx; }
+void TFE_DeleteContext(TFE_Context* ctx) { delete ctx; }
 
 TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) {
   TF_DeviceList* list = new TF_DeviceList;
@@ -336,7 +336,7 @@ TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) {
 }
 
 void TFE_DeleteTensorHandle(TFE_TensorHandle* h) {
-  DCHECK(h);
+  if (h == nullptr) return;
   if (h->handle) {
     h->handle->Unref();
   }
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index fdbd5374b2afe815c3a81b453930eb8f1fa351d3..ea019a5711c1bbd4547819e976acf98fc06ecbde 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -102,8 +102,7 @@ typedef struct TFE_Context TFE_Context;
 
 TF_CAPI_EXPORT extern TFE_Context* TFE_NewContext(
     const TFE_ContextOptions* opts, TF_Status* status);
-TF_CAPI_EXPORT extern void TFE_DeleteContext(TFE_Context* ctx,
-                                             TF_Status* status);
+TF_CAPI_EXPORT extern void TFE_DeleteContext(TFE_Context* ctx);
 TF_CAPI_EXPORT extern TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx,
                                                             TF_Status* status);
 
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 3504a8b5e78480732d3454097c1b2197ac2b2e17..0bdea70fe6b53ec374d856984741b211258b1d13 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -49,7 +49,7 @@ void BM_InitOp(int iters) {
   }
   tensorflow::testing::StopTiming();
   TFE_DeleteTensorHandle(m);
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
 }
@@ -80,7 +80,7 @@ void BM_Execute(int iters, int async) {
   tensorflow::testing::StopTiming();
   TFE_DeleteOp(matmul);
   TFE_DeleteTensorHandle(m);
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
 }
@@ -95,7 +95,7 @@ TEST(CAPI, Context) {
   TF_DeviceList* devices = TFE_ContextListDevices(ctx, status);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
 
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
 
   const int num_devices = TF_DeviceListCount(devices);
@@ -195,7 +195,7 @@ void TestRemoteExecute(bool async) {
   TFE_DeleteOp(matmul);
 
   TFE_ContextAsyncWait(ctx, status);
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
 
   TF_DeleteStatus(status);
@@ -281,7 +281,7 @@ void TestRemoteExecuteSilentCopies(bool async) {
   TFE_DeleteOp(matmul);
 
   TFE_ContextAsyncWait(ctx, status);
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
 
   TF_DeleteStatus(status);
@@ -380,8 +380,7 @@ void TensorHandleCopyBetweenDevices(bool async) {
   TF_DeleteDeviceList(devices);
   TF_DeleteTensor(t);
   TFE_DeleteTensorHandle(hcpu);
-  TFE_DeleteContext(ctx, status.get());
-  EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  TFE_DeleteContext(ctx);
 }
 
 TEST(CAPI, TensorHandleCopyBetweenDevices) {
@@ -418,7 +417,7 @@ void TensorHandleCopyBetweenDevicesError(bool async) {
   TFE_DeleteTensorHandle(hcopy);
   TFE_DeleteTensorHandle(hcpu);
   if (hdevice != nullptr) TFE_DeleteTensorHandle(hdevice);
-  TFE_DeleteContext(ctx, status.get());
+  TFE_DeleteContext(ctx);
 }
 
 TEST(CAPI, TensorHandleCopyBetweenDevicesError) {
@@ -451,7 +450,7 @@ void TensorHandleCopyBetweenTwoGPUDevices(bool async) {
     TF_DeleteDeviceList(devices);
     TF_DeleteTensor(t);
     TFE_DeleteTensorHandle(hcpu);
-    TFE_DeleteContext(ctx, status.get());
+    TFE_DeleteContext(ctx);
     return;
   }
   const string gpu_1_name(TF_DeviceListName(devices, 1, status.get()));
@@ -484,8 +483,7 @@ void TensorHandleCopyBetweenTwoGPUDevices(bool async) {
   TF_DeleteDeviceList(devices);
   TF_DeleteTensor(t);
   TFE_DeleteTensorHandle(hcpu);
-  TFE_DeleteContext(ctx, status.get());
-  EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  TFE_DeleteContext(ctx);
 }
 
 TEST(CAPI, TensorHandleCopyBetweenTwoGPUDevices) {
@@ -533,8 +531,7 @@ void TensorHandleSilentCopy(bool async) {
   TFE_DeleteTensorHandle(hcpu);
   TFE_ContextAsyncWait(ctx, status.get());
   EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
-  TFE_DeleteContext(ctx, status.get());
-  EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  TFE_DeleteContext(ctx);
 }
 
 TEST(CAPI, TensorHandleSilentCopy) { TensorHandleSilentCopy(false); }
@@ -580,8 +577,7 @@ void TensorHandleSilentCopyLocal(bool async) {
   TFE_DeleteTensorHandle(hcpu);
   TFE_ContextAsyncWait(ctx, status.get());
   EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
-  TFE_DeleteContext(ctx, status.get());
-  EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  TFE_DeleteContext(ctx);
 }
 TEST(CAPI, TensorHandleSilentCopyLocal) { TensorHandleSilentCopyLocal(false); }
 TEST(CAPI, TensorHandleSilentCopyLocalAsync) {
@@ -614,7 +610,7 @@ void SetAndGetOpDevices(bool async) {
 
   TFE_DeleteOp(matmul);
   TFE_DeleteTensorHandle(m);
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
 }
@@ -640,7 +636,7 @@ void Execute_MatMul_CPU(bool async) {
   TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteTensorHandle(retvals[0]);
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   float product[4] = {0};
   EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
@@ -712,7 +708,7 @@ void Execute_MatMul_CPU_Runtime_Error(bool async) {
   TFE_DeleteTensorHandle(m1);
   TFE_DeleteTensorHandle(m2);
   TFE_DeleteTensorHandle(retvals[0]);
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   TF_DeleteStatus(status);
 }
 TEST(CAPI, Execute_MatMul_CPU_Runtime_Error) {
@@ -743,7 +739,7 @@ void Execute_MatMul_CPU_Type_Error(bool async) {
   if (retvals[0] != nullptr) {
     TFE_DeleteTensorHandle(retvals[0]);
   }
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   TF_DeleteStatus(status);
 }
 
@@ -781,7 +777,7 @@ TEST(CAPI, Execute_Min_CPU) {
   TF_DeleteTensor(t);
   EXPECT_EQ(1, output[0]);
   EXPECT_EQ(3, output[1]);
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
 }
@@ -823,7 +819,7 @@ void Execute_MatMul_XLA_CPU(bool async) {
   EXPECT_EQ(10, product[1]);
   EXPECT_EQ(15, product[2]);
   EXPECT_EQ(22, product[3]);
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   TF_DeleteStatus(status);
 }
 TEST(CAPI, Execute_MatMul_XLA_CPU) { Execute_MatMul_XLA_CPU(false); }
@@ -862,7 +858,7 @@ void Execute_Min_XLA_CPU(bool async) {
   TF_DeleteTensor(t);
   EXPECT_EQ(1, output[0]);
   EXPECT_EQ(3, output[1]);
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   TF_DeleteStatus(status);
 }
 TEST(CAPI, Execute_Min_XLA_CPU) { Execute_Min_XLA_CPU(false); }
@@ -898,7 +894,7 @@ void ExecuteWithTracing(bool async) {
 
   TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
   TFE_DeleteTensorHandle(retvals[0]);
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   float product[4] = {0};
   EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
@@ -974,7 +970,7 @@ TEST(CAPI, Function_ident_CPU) {
     TF_DeleteTensor(r);
     TFE_DeleteTensorHandle(result[0]);
   }
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
   TF_DeleteStatus(status);
 }
@@ -1044,7 +1040,7 @@ TEST(CAPI, Function_ident_XLA_CPU) {
     TF_DeleteTensor(r);
     TFE_DeleteTensorHandle(result[0]);
   }
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status);
   TF_DeleteStatus(status);
 }
@@ -1120,7 +1116,7 @@ void FunctionDefAndExecute(bool async) {
   EXPECT_EQ(10, product[1]);
   EXPECT_EQ(15, product[2]);
   EXPECT_EQ(22, product[3]);
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
 }
@@ -1161,7 +1157,7 @@ void BM_ExecuteFunction(int iters, int async) {
   tensorflow::testing::StopTiming();
   TFE_DeleteTensorHandle(m);
   TFE_DeleteTensorHandle(retval[0]);
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
 }
@@ -1249,7 +1245,7 @@ TEST(CAPI, Variables) {
 
   TFE_DeleteTensorHandle(var_handle);
   TFE_DeleteTensorHandle(value_handle);
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
 }
@@ -1288,7 +1284,7 @@ void BM_ReadVariable(int iters) {
   TFE_DeleteOp(op);
 
   TFE_DeleteTensorHandle(var_handle);
-  TFE_DeleteContext(ctx, status);
+  TFE_DeleteContext(ctx);
   CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TF_DeleteStatus(status);
 }
diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc
index fd7b6fe6625f27bda92e2f56f60908658cdecd7e..1c9bdff5e1295135abe60c282d565c39071fd78a 100644
--- a/tensorflow/cc/gradients/math_grad_test.cc
+++ b/tensorflow/cc/gradients/math_grad_test.cc
@@ -475,11 +475,7 @@ TEST_F(CWiseUnaryGradTest, Tan_Complex) {
   auto x_fn = [this](const int i) {
     return CRV({{1, 0}, {0, 1}, {2, -1}, {1, 2}, {3, 4}});
   };
-  // TODO(kbsriram)
-  // Enable when tan kernel supports complex inputs
-  if (false) {
-    TestCWiseGrad<complex64, complex64>(TAN, x_fn);
-  }
+  TestCWiseGrad<complex64, complex64>(TAN, x_fn);
 }
 
 TEST_F(CWiseUnaryGradTest, Atan) {
diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc
index c73482d5f4d13ade0dc0412941251d1651371b6e..588e96cb196189780037f66266484962ba0385e4 100644
--- a/tensorflow/cc/gradients/nn_grad.cc
+++ b/tensorflow/cc/gradients/nn_grad.cc
@@ -47,6 +47,72 @@ Status SoftmaxGrad(const Scope& scope, const Operation& op,
 }
 REGISTER_GRADIENT_OP("Softmax", SoftmaxGrad);
 
+bool IsZero(const Scope& scope, const Output& grad) {
+  string op_type_name = grad.op().node()->type_string();
+  if (op_type_name == "ZerosLike" || op_type_name == "Zeros") {
+    return true;
+  }
+  // The Operation we were provided is not named something obvious so
+  // we need to actually look at its contents.
+  // The original python code did this by calling a utility function called
+  // tensor_util.constant_value.
+  // There is no C++ equivalent to tensor_util.constant_value so we do nothing
+  // for the moment.
+  return false;
+}
+
+// Multiply after broadcasting vec to match dimensions of mat.
+//   Args:
+//     vec: A 1-D tensor of dimension [D0]
+//     mat: A 2-D tensor of dimesnion [D0, D1]
+//
+//   Returns:
+//     A tensor of dimension [D0, D1], the result fo vec * mat.
+Output BroadcastMul(const Scope& scope, const Output& vec, const Output& mat) {
+  auto reshaped = ExpandDims(scope, vec, -1);
+  return Multiply(scope, reshaped, mat);
+}
+
+Status SoftmaxCrossEntropyWithLogitsGrad(const Scope& scope,
+                                         const Operation& op,
+                                         const std::vector<Output>& grad_inputs,
+                                         std::vector<Output>* grad_outputs) {
+  // Softmax gradient with cross entropy logits function.
+  // We multiply the backprop for cost with the gradients - op.output[1].
+  // There is no gradient for labels.
+
+  // The outputs of the network are at input index 0.
+  auto logits = op.input(0);
+  // The "truth" labels are at index 1.
+  auto softmax_grad = op.output(1);
+
+  // The loss is the output at index 0, and backprop is the output at index 1.
+  auto grad_loss = grad_inputs[0];
+  auto grad_grad = grad_inputs[1];
+
+  auto grad = BroadcastMul(scope, grad_loss, softmax_grad);
+  if (!IsZero(scope, grad_grad)) {
+    std::vector<int> axis;
+    auto logits_softmax = Softmax(scope, logits);
+
+    auto grad_grad_expand = ExpandDims(scope, grad_grad, 1);
+    auto logits_softmax_expand = ExpandDims(scope, logits_softmax, 2);
+    auto matmul_result =
+        BatchMatMul(scope, grad_grad_expand, logits_softmax_expand);
+    axis.push_back(1);
+    auto squeeze_result = Squeeze(scope, matmul_result, Squeeze::Axis(axis));
+    auto subtraction_result = Subtract(scope, grad_grad, squeeze_result);
+    auto multiply_result = Multiply(scope, subtraction_result, logits_softmax);
+    grad = Add(scope, grad, multiply_result);
+  }
+  auto minus_log_softmax = Multiply(scope, LogSoftmax(scope, logits), -1.0f);
+  grad_outputs->push_back(grad);
+  grad_outputs->push_back(BroadcastMul(scope, grad_loss, minus_log_softmax));
+  return scope.status();
+}
+REGISTER_GRADIENT_OP("SoftmaxCrossEntropyWithLogits",
+                     SoftmaxCrossEntropyWithLogitsGrad);
+
 Status LogSoftmaxGrad(const Scope& scope, const Operation& op,
                       const std::vector<Output>& grad_inputs,
                       std::vector<Output>* grad_outputs) {
@@ -195,9 +261,9 @@ Status MaxPool3DGradHelper(const Scope& scope, const Operation& op,
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
   MaxPool3DGrad::Attrs grad_attrs;
-  auto dx = MaxPool3DGrad(scope, op.input(0), op.output(0), grad_inputs[0],
-                          ksize, strides, padding,
-                          grad_attrs.DataFormat(data_format));
+  auto dx =
+      MaxPool3DGrad(scope, op.input(0), op.output(0), grad_inputs[0], ksize,
+                    strides, padding, grad_attrs.DataFormat(data_format));
   grad_outputs->push_back(dx);
   return scope.status();
 }
@@ -216,10 +282,9 @@ Status AvgPoolGradHelper(const Scope& scope, const Operation& op,
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
   internal::AvgPoolGrad::Attrs grad_attrs;
-  auto dx =
-      internal::AvgPoolGrad(scope, Shape(scope, op.input(0)), grad_inputs[0],
-                            ksize, strides, padding,
-                            grad_attrs.DataFormat(data_format));
+  auto dx = internal::AvgPoolGrad(scope, Shape(scope, op.input(0)),
+                                  grad_inputs[0], ksize, strides, padding,
+                                  grad_attrs.DataFormat(data_format));
   grad_outputs->push_back(dx);
   return scope.status();
 }
@@ -238,9 +303,9 @@ Status AvgPool3DGradHelper(const Scope& scope, const Operation& op,
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "padding", &padding));
   TF_RETURN_IF_ERROR(GetNodeAttr(attrs, "data_format", &data_format));
   AvgPool3DGrad::Attrs grad_attrs;
-  auto dx = AvgPool3DGrad(scope, Shape(scope, op.input(0)), grad_inputs[0],
-                          ksize, strides, padding,
-                          grad_attrs.DataFormat(data_format));
+  auto dx =
+      AvgPool3DGrad(scope, Shape(scope, op.input(0)), grad_inputs[0], ksize,
+                    strides, padding, grad_attrs.DataFormat(data_format));
   grad_outputs->push_back(dx);
   return scope.status();
 }
diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc
index b4d457a9d14eb79232cda9412fa0050f6a9968cc..aa72cf7ba2a958f54d50b59f0edaefb27edf0e86 100644
--- a/tensorflow/cc/gradients/nn_grad_test.cc
+++ b/tensorflow/cc/gradients/nn_grad_test.cc
@@ -25,6 +25,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+using ops::AvgPool;
+using ops::AvgPool3D;
 using ops::BiasAdd;
 using ops::Conv2D;
 using ops::Elu;
@@ -33,11 +35,9 @@ using ops::FractionalMaxPool;
 using ops::L2Loss;
 using ops::LogSoftmax;
 using ops::LRN;
-using ops::AvgPool;
-using ops::AvgPool3D;
 using ops::MaxPool;
-using ops::MaxPoolV2;
 using ops::MaxPool3D;
+using ops::MaxPoolV2;
 using ops::Placeholder;
 using ops::Relu;
 using ops::Relu6;
@@ -111,6 +111,20 @@ TEST_F(NNGradTest, SoftmaxGrad) {
   RunTest(x, shape, y, shape);
 }
 
+TEST_F(NNGradTest, SoftmaxCrossEntropyWithLogitsGrad) {
+  TensorShape logits_shape({5, 3});
+  TensorShape loss_shape({5});
+
+  auto logits = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(logits_shape));
+  auto labels = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(logits_shape));
+  auto y =
+      tensorflow::ops::SoftmaxCrossEntropyWithLogits(scope_, logits, labels);
+  // Note the reversal of the backprop and loss orders. Issue #18734 has been
+  // opened for this.
+  RunTest({logits, labels}, {logits_shape, logits_shape}, {y.backprop, y.loss},
+          {logits_shape, loss_shape});
+}
+
 TEST_F(NNGradTest, LogSoftmaxGrad) {
   TensorShape shape({5, 3});
   auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
@@ -253,7 +267,7 @@ TEST_F(NNGradTest, AvgPool3DGradHelper) {
   RunTest(x, x_shape, y, y_shape);
 }
 
-TEST_F(NNGradTest, LRN){
+TEST_F(NNGradTest, LRN) {
   TensorShape x_shape({1, 1, 2, 1});
   auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape));
   auto y = LRN(scope_, x);
diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD
index 730b1b669b52a9d87fb4045471ce5b931312093e..3d3895c8fa82c3c0e2974228e9cad767d0e00df4 100644
--- a/tensorflow/cc/saved_model/BUILD
+++ b/tensorflow/cc/saved_model/BUILD
@@ -39,9 +39,20 @@ cc_library(
     hdrs = ["reader.h"],
     deps = [
         ":constants",
+    ] + if_not_mobile([
+        # TODO(b/111634734): :lib and :protos_all contain dependencies that
+        # cannot be built on mobile platforms. Instead, include the appropriate
+        # tf_lib depending on the build platform.
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
-    ],
+    ]) + if_mobile([
+        # Mobile-friendly SavedModel proto. See go/portable-proto for more info.
+        "//tensorflow/core:saved_model_portable_proto",
+    ]) + if_android([
+        "//tensorflow/core:android_tensorflow_lib",
+    ]) + if_ios([
+        "//tensorflow/core:ios_tensorflow_lib",
+    ]),
 )
 
 tf_cc_test(
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index 07807ed2f3e10cb03ec363562087a40c9e86fc5e..98be66a6add67a8053e286521e564286cdb8ef8d 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -74,6 +74,54 @@ void AddAssetsTensorsToInputs(const StringPiece export_dir,
   }
 }
 
+// Like Session::Run(), but uses the Make/Run/ReleaseCallable() API to avoid
+// leaving behind non-GC'ed state.
+//
+// Detailed motivation behind this approach, from ashankar@:
+//
+// Each call to Session::Run() that identifies a new subgraph (based on feeds
+// and fetches) creates some datastructures that live as long as the session
+// (the partitioned graph, associated executors etc.).
+//
+// A pathological case of this would be if say the initialization op
+// (main_op/legacy_init_op) involves the use of a large constant. Then we
+// allocate memory for that large constant that will just stick around till the
+// session dies. With this Callable mechanism, that memory will be released
+// right after ReleaseCallable returns.
+//
+// However, the resource manager state remains.
+Status RunOnce(const RunOptions& run_options,
+               const std::vector<std::pair<string, Tensor>>& inputs,
+               const std::vector<string>& output_tensor_names,
+               const std::vector<string>& target_node_names,
+               std::vector<Tensor>* outputs, RunMetadata* run_metadata,
+               Session* session) {
+  CallableOptions callable_options;
+  std::vector<Tensor> feed_tensors;
+  *callable_options.mutable_run_options() = run_options;
+  for (const auto& input : inputs) {
+    const string& name = input.first;
+    const Tensor& tensor = input.second;
+    callable_options.add_feed(name);
+    feed_tensors.push_back(tensor);
+  }
+  for (const string& output_tensor_name : output_tensor_names) {
+    callable_options.add_fetch(output_tensor_name);
+  }
+  for (const string& target_node_name : target_node_names) {
+    callable_options.add_target(target_node_name);
+  }
+
+  Session::CallableHandle callable_handle;
+  TF_RETURN_IF_ERROR(session->MakeCallable(callable_options, &callable_handle));
+  const Status run_status = session->RunCallable(callable_handle, feed_tensors,
+                                                 outputs, run_metadata);
+  // Be sure to call ReleaseCallable() regardless of the outcome of
+  // RunCallable().
+  session->ReleaseCallable(callable_handle).IgnoreError();
+  return run_status;
+}
+
 bool HasMainOp(const MetaGraphDef& meta_graph_def) {
   const auto& collection_def_map = meta_graph_def.collection_def();
   if (collection_def_map.find(kSavedModelMainOpKey) !=
@@ -86,10 +134,11 @@ bool HasMainOp(const MetaGraphDef& meta_graph_def) {
 Status RunMainOp(const RunOptions& run_options, const string& export_dir,
                  const MetaGraphDef& meta_graph_def,
                  const std::vector<AssetFileDef>& asset_file_defs,
-                 Session* session) {
-  LOG(INFO) << "Running MainOp on SavedModel bundle.";
+                 Session* session, const string& main_op_key) {
+  LOG(INFO) << "Running MainOp with key " << main_op_key
+            << " on SavedModel bundle.";
   const auto& collection_def_map = meta_graph_def.collection_def();
-  const auto main_op_it = collection_def_map.find(kSavedModelMainOpKey);
+  const auto main_op_it = collection_def_map.find(main_op_key);
   if (main_op_it != collection_def_map.end()) {
     if (main_op_it->second.node_list().value_size() != 1) {
       return errors::FailedPrecondition(
@@ -99,8 +148,8 @@ Status RunMainOp(const RunOptions& run_options, const string& export_dir,
     AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs);
     RunMetadata run_metadata;
     const StringPiece main_op_name = main_op_it->second.node_list().value(0);
-    return session->Run(run_options, inputs, {}, {main_op_name.ToString()},
-                        nullptr /* outputs */, &run_metadata);
+    return RunOnce(run_options, inputs, {}, {main_op_name.ToString()},
+                   nullptr /* outputs */, &run_metadata, session);
   }
   return Status::OK();
 }
@@ -137,32 +186,8 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir,
   AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs);
 
   RunMetadata run_metadata;
-  return session->Run(run_options, inputs, {}, {restore_op_name.ToString()},
-                      nullptr /* outputs */, &run_metadata);
-}
-
-Status RunLegacyInitOp(const RunOptions& run_options, const string& export_dir,
-                       const MetaGraphDef& meta_graph_def,
-                       const std::vector<AssetFileDef>& asset_file_defs,
-                       Session* session) {
-  LOG(INFO) << "Running LegacyInitOp on SavedModel bundle.";
-  const auto& collection_def_map = meta_graph_def.collection_def();
-  const auto init_op_it = collection_def_map.find(kSavedModelLegacyInitOpKey);
-  if (init_op_it != collection_def_map.end()) {
-    if (init_op_it->second.node_list().value_size() != 1) {
-      return errors::FailedPrecondition(strings::StrCat(
-          "Expected exactly one serving init op in : ", export_dir));
-    }
-    std::vector<std::pair<string, Tensor>> inputs;
-    AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs);
-    RunMetadata run_metadata;
-    const StringPiece legacy_init_op_name =
-        init_op_it->second.node_list().value(0);
-    return session->Run(run_options, inputs, {},
-                        {legacy_init_op_name.ToString()}, nullptr /* outputs */,
-                        &run_metadata);
-  }
-  return Status::OK();
+  return RunOnce(run_options, inputs, {}, {restore_op_name.ToString()},
+                 nullptr /* outputs */, &run_metadata, session);
 }
 
 Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def,
@@ -204,11 +229,11 @@ Status LoadSavedModelInternal(const SessionOptions& session_options,
   if (HasMainOp(bundle->meta_graph_def)) {
     TF_RETURN_IF_ERROR(RunMainOp(run_options, export_dir,
                                  bundle->meta_graph_def, asset_file_defs,
-                                 bundle->session.get()));
+                                 bundle->session.get(), kSavedModelMainOpKey));
   } else {
-    TF_RETURN_IF_ERROR(RunLegacyInitOp(run_options, export_dir,
-                                       bundle->meta_graph_def, asset_file_defs,
-                                       bundle->session.get()));
+    TF_RETURN_IF_ERROR(RunMainOp(
+        run_options, export_dir, bundle->meta_graph_def, asset_file_defs,
+        bundle->session.get(), kSavedModelLegacyInitOpKey));
   }
   return Status::OK();
 }
diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD
index 2119c8ec47f941a76e81346ae5d20da78eae11a3..fef8b8d4d4cdcc97a913ae2ba6d1a8b0b0084f89 100644
--- a/tensorflow/compiler/aot/BUILD
+++ b/tensorflow/compiler/aot/BUILD
@@ -68,6 +68,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:compile_only_client",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
         "//tensorflow/core:core_cpu_internal",
diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index bbc35da2ef6d14ff0d3570ef2d5cf6743456c674..2b5f97b34cd928d32eb220536342c715d91d45bb 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/compile_only_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 9174a67cc6d110ac21c7bb09346bb1b2dfad0579..e34347b9d4e31be1b37b7ef1cb30911dd290ea7b 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -166,6 +166,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/service:stream_pool",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
diff --git a/tensorflow/compiler/jit/deadness_analysis.cc b/tensorflow/compiler/jit/deadness_analysis.cc
index b2d119029a4e2ab82a87968266b04e3a66f6fdfb..d81e5fe9008975c126bcd8e0ea7cef19f1eb1bf3 100644
--- a/tensorflow/compiler/jit/deadness_analysis.cc
+++ b/tensorflow/compiler/jit/deadness_analysis.cc
@@ -44,10 +44,6 @@ class Predicate {
   enum class Kind { kAnd, kOr, kNot, kSymbol };
 
   virtual string ToString() const = 0;
-  virtual bool operator==(const Predicate& other) const = 0;
-  virtual bool operator!=(const Predicate& other) const {
-    return !(*this == other);
-  }
   int64 hash() const { return hash_; }
 
   virtual Kind kind() const = 0;
@@ -58,6 +54,8 @@ class Predicate {
 
  private:
   const int64 hash_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(Predicate);
 };
 
 int64 HashPredicateSequence(Predicate::Kind kind,
@@ -69,19 +67,6 @@ int64 HashPredicateSequence(Predicate::Kind kind,
   return hash;
 }
 
-bool PredicateSequenceEqual(gtl::ArraySlice<Predicate*> lhs,
-                            gtl::ArraySlice<Predicate*> rhs) {
-  if (lhs.size() != rhs.size()) {
-    return false;
-  }
-  for (int64 i = 0; i < lhs.size(); i++) {
-    if (*lhs[i] != *rhs[i]) {
-      return false;
-    }
-  }
-  return true;
-}
-
 // Represents a logical conjunction of a set of predicates.
 class AndPredicate : public Predicate {
  public:
@@ -102,17 +87,9 @@ class AndPredicate : public Predicate {
     return strings::StrCat("(", str_util::Join(operands_str, " & "), ")");
   }
 
-  bool operator==(const Predicate& other) const override {
-    return other.kind() == Kind::kAnd &&
-           PredicateSequenceEqual(
-               dynamic_cast<const AndPredicate&>(other).operands(), operands());
-  }
-
   Kind kind() const override { return Kind::kAnd; }
 
-  const tensorflow::gtl::ArraySlice<Predicate*> operands() const {
-    return operands_;
-  }
+  const gtl::ArraySlice<Predicate*> operands() const { return operands_; }
 
  private:
   std::vector<Predicate*> operands_;
@@ -138,16 +115,8 @@ class OrPredicate : public Predicate {
     return strings::StrCat("(", str_util::Join(operands_str, " | "), ")");
   }
 
-  bool operator==(const Predicate& other) const override {
-    return other.kind() == Kind::kOr &&
-           PredicateSequenceEqual(
-               dynamic_cast<const OrPredicate&>(other).operands(), operands());
-  }
-
   Kind kind() const override { return Kind::kOr; }
-  const tensorflow::gtl::ArraySlice<Predicate*> operands() const {
-    return operands_;
-  }
+  const gtl::ArraySlice<Predicate*> operands() const { return operands_; }
 
  private:
   std::vector<Predicate*> operands_;
@@ -164,11 +133,6 @@ class NotPredicate : public Predicate {
     return strings::StrCat("~", operand()->ToString());
   }
 
-  bool operator==(const Predicate& other) const override {
-    return other.kind() == Kind::kNot &&
-           *dynamic_cast<const NotPredicate&>(other).operand() == *operand();
-  }
-
   Kind kind() const override { return Kind::kNot; }
   Predicate* operand() const { return operand_; }
 
@@ -188,14 +152,6 @@ class SymbolPredicate : public Predicate {
         must_be_true_(must_be_true) {}
 
   string ToString() const override { return tensor_id_.ToString(); }
-  bool operator==(const Predicate& other) const override {
-    return other.kind() == Kind::kSymbol &&
-           must_be_true() ==
-               dynamic_cast<const SymbolPredicate&>(other).must_be_true() &&
-           dynamic_cast<const SymbolPredicate&>(other).tensor_id() ==
-               tensor_id();
-  }
-
   Kind kind() const override { return Kind::kSymbol; }
 
   // If `must_be_true()` is true this SymbolPredicate represents the proposition
@@ -225,16 +181,37 @@ class PredicateFactory {
   Predicate* MakeAndPredicate(gtl::ArraySlice<Predicate*> operands) {
     return MakeAndOrImpl(operands, /*is_and=*/true);
   }
+
   Predicate* MakeOrPredicate(gtl::ArraySlice<Predicate*> operands) {
     return MakeAndOrImpl(operands, /*is_and=*/false);
   }
 
   Predicate* MakeNotPredicate(Predicate* pred) {
-    return Make<NotPredicate>(pred);
+    SignatureForNot signature = pred;
+    auto it = interned_not_instances_.find(signature);
+    if (it == interned_not_instances_.end()) {
+      std::unique_ptr<Predicate> new_pred = Make<NotPredicate>(pred);
+      Predicate* new_pred_ptr = new_pred.get();
+      interned_not_instances_.emplace(signature, std::move(new_pred));
+      return new_pred_ptr;
+    } else {
+      return it->second.get();
+    }
   }
 
   Predicate* MakeSymbolPredicate(TensorId tensor_id, bool must_be_true) {
-    return Make<SymbolPredicate>(tensor_id, must_be_true);
+    SignatureForSymbol signature = {tensor_id, must_be_true};
+    auto it = interned_symbol_instances_.find(signature);
+    if (it == interned_symbol_instances_.end()) {
+      std::unique_ptr<Predicate> new_pred =
+          Make<SymbolPredicate>(tensor_id, must_be_true);
+      Predicate* new_pred_ptr = new_pred.get();
+      interned_symbol_instances_.emplace(std::move(signature),
+                                         std::move(new_pred));
+      return new_pred_ptr;
+    } else {
+      return it->second.get();
+    }
   }
 
   Predicate* MakeTrue() { return MakeAndPredicate({}); }
@@ -242,29 +219,53 @@ class PredicateFactory {
 
  private:
   template <typename PredicateT, typename... Args>
-  Predicate* Make(Args... args) {
-    std::unique_ptr<PredicateT> pred(
+  std::unique_ptr<Predicate> Make(Args&&... args) {
+    return std::unique_ptr<PredicateT>(
         new PredicateT(std::forward<Args>(args)...));
-    predicate_storage_.emplace_back(std::move(pred));
-    return predicate_storage_.back().get();
   }
 
   Predicate* MakeAndOrImpl(gtl::ArraySlice<Predicate*> operands, bool is_and);
 
-  struct PredicatePtrHash {
-    size_t operator()(const Predicate* pred) const { return pred->hash(); }
+  // Predicate instances are interned, meaning that there is only a single
+  // instance of a Predicate object with a given content.  This makes checking
+  // for structural equality super-cheap -- we can just compare pointers.
+  //
+  // We intern predicates by maintaining a map from the content of a Predicate
+  // to the only instance of said predicate we allow to exist in the
+  // interned_and_or_instances_, interned_not_instances_ and
+  // interned_symbol_instances_ fields.  These maps also double up as storage
+  // for the owning pointers to predicate instances.
+
+  using SignatureForAndOr =
+      std::pair<Predicate::Kind, gtl::ArraySlice<Predicate*>>;
+  using SignatureForNot = Predicate*;
+  using SignatureForSymbol = std::pair<SafeTensorId, bool>;
+
+  struct HashSignatureForAndOr {
+    size_t operator()(const SignatureForAndOr& signature) const {
+      size_t hash = ::tensorflow::hash<Predicate::Kind>()(signature.first);
+      for (Predicate* p : signature.second) {
+        hash = Hash64Combine(hash, ::tensorflow::hash<Predicate*>()(p));
+      }
+      return hash;
+    }
   };
 
-  struct PredicatePtrEq {
-    size_t operator()(const Predicate* a, const Predicate* b) const {
-      return *a == *b;
+  struct HashSignatureForSymbol {
+    size_t operator()(const SignatureForSymbol& signature) const {
+      return Hash64Combine(SafeTensorId::Hasher()(signature.first),
+                           ::tensorflow::hash<bool>()(signature.second));
     }
   };
 
-  using PredicateSet =
-      gtl::FlatSet<Predicate*, PredicatePtrHash, PredicatePtrEq>;
-
-  std::vector<std::unique_ptr<Predicate>> predicate_storage_;
+  gtl::FlatMap<SignatureForAndOr, std::unique_ptr<Predicate>,
+               HashSignatureForAndOr>
+      interned_and_or_instances_;
+  gtl::FlatMap<SignatureForNot, std::unique_ptr<Predicate>>
+      interned_not_instances_;
+  gtl::FlatMap<SignatureForSymbol, std::unique_ptr<Predicate>,
+               HashSignatureForSymbol>
+      interned_symbol_instances_;
 };
 
 // Common code to create AndPredicate or OrPredicate instances.
@@ -272,7 +273,7 @@ Predicate* PredicateFactory::MakeAndOrImpl(gtl::ArraySlice<Predicate*> operands,
                                            bool is_and) {
   Predicate::Kind pred_kind =
       is_and ? Predicate::Kind::kAnd : Predicate::Kind::kOr;
-  PredicateSet simplified_ops_set;
+  gtl::FlatSet<Predicate*> simplified_ops_set;
   std::vector<Predicate*> simplified_ops;
   for (Predicate* op : operands) {
     // Simplify A&A => A and  A|A => A.
@@ -300,7 +301,7 @@ Predicate* PredicateFactory::MakeAndOrImpl(gtl::ArraySlice<Predicate*> operands,
   }
 
   // Simplify "A&~A=>False" and "A|~A=>True".
-  PredicateSet negated_ops;
+  gtl::FlatSet<Predicate*> negated_ops;
   for (Predicate* op : simplified_ops) {
     if (op->kind() == Predicate::Kind::kNot) {
       negated_ops.insert(dynamic_cast<NotPredicate&>(*op).operand());
@@ -317,8 +318,26 @@ Predicate* PredicateFactory::MakeAndOrImpl(gtl::ArraySlice<Predicate*> operands,
       simplified_ops.begin(), simplified_ops.end(),
       [](Predicate* a, Predicate* b) { return a->hash() < b->hash(); });
 
-  return is_and ? Make<AndPredicate>(std::move(simplified_ops))
-                : Make<OrPredicate>(std::move(simplified_ops));
+  auto it = interned_and_or_instances_.find({pred_kind, simplified_ops});
+  if (it == interned_and_or_instances_.end()) {
+    simplified_ops.shrink_to_fit();
+    // NB!  Because we'll use a non-owning reference to simplified_ops in the
+    // key for interned_and_or_instances_ we need to be careful to std::move()
+    // it all the way through.
+    gtl::ArraySlice<Predicate*> operands_slice = simplified_ops;
+    std::unique_ptr<Predicate> new_pred =
+        is_and ? Make<AndPredicate>(std::move(simplified_ops))
+               : Make<OrPredicate>(std::move(simplified_ops));
+
+    Predicate* new_pred_ptr = new_pred.get();
+    CHECK(interned_and_or_instances_
+              .emplace(SignatureForAndOr(pred_kind, operands_slice),
+                       std::move(new_pred))
+              .second);
+    return new_pred_ptr;
+  } else {
+    return it->second.get();
+  }
 }
 
 class DeadnessAnalysisImpl : public DeadnessAnalysis {
@@ -491,8 +510,9 @@ bool DeadnessAnalysisImpl::HasInputsWithMismatchingDeadness(const Node& node) {
 
     // Today we just compare the predicates for equality (with some
     // canonicalization/simplification happening before) but we could be more
-    // sophisticated here if need be.
-    if (pred != nullptr && *pred != *it->second) {
+    // sophisticated here if need be.  Comparing pointers is sufficient because
+    // we intern Predicate instances by their content.
+    if (pred != nullptr && pred != it->second) {
       if (vlog_) {
         VLOG(2) << "HasInputsWithMismatchingDeadness(" << node.name()
                 << ") -> true";
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 73db0d595284b39f1073e7be9b755aa7c72b2200..38eb6d830f4d4e889810acd0f928e93d0b22bde8 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <unordered_map>
 #include <unordered_set>
 
+#include "tensorflow/compiler/jit/deadness_analysis.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
 #include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
@@ -463,6 +464,12 @@ Status MarkForCompilationPass::Run(
   VLOG(1) << "flags->tf_xla_fusion_only = " << flags->tf_xla_fusion_only;
   const FunctionLibraryDefinition* fld = options.flib_def;
 
+  std::unique_ptr<DeadnessAnalysis> deadness;
+  {
+    XLA_SCOPED_LOGGING_TIMER_LEVEL("DeadnessAnalysis", 1);
+    TF_RETURN_IF_ERROR(DeadnessAnalysis::Run(**options.graph, &deadness));
+  }
+
   auto is_compilable = [&](const Node* node, const DeviceType& device_type) {
     const XlaOpRegistry::DeviceRegistration* registration;
     if (!XlaOpRegistry::GetCompilationDevice(device_type.type(),
@@ -470,15 +477,6 @@ Status MarkForCompilationPass::Run(
       return false;
     }
 
-    // TODO(b/111570009): This bailout for ControlTrigger is probably not
-    // needed.
-    //
-    // Don't compile control trigger nodes. We won't preserve their deadness
-    // semantics correctly, so it's safest not to compile them.
-    if (node->IsControlTrigger()) {
-      return false;
-    }
-
     // If this device requires a JIT, we must say yes.
     if (registration->requires_compilation) return true;
 
@@ -490,6 +488,14 @@ Status MarkForCompilationPass::Run(
     status = fld->GetAttr(*node, kXlaCompileAttr, &compile);
     if (status.ok()) return compile;
 
+    // If inputs to `node` can have conflicting deadness (i.e. some are alive
+    // and some are dead) then don't compile it.  XLA cannot represent the
+    // deadness semantics of these nodes correctly and auto-clustering these
+    // nodes can cause deadness to propagate to nodes that should be live.
+    if (node->IsMerge() || deadness->HasInputsWithMismatchingDeadness(*node)) {
+      return false;
+    }
+
     // Check for fusable ops only if requested.
     if (global_jit_level > 0 && fusion_only && !IsXlaFusable(node->def())) {
       return false;
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 772c92d369e67f431b5d030d1d5cdc5ae2700d39..2c5f4fb774fcab082c0d0d316cdc6757cacc1e96 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
 #include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/sendrecv_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -680,5 +681,37 @@ TEST(XlaCompilationTest, ClusterIdentityWithNonRefInput) {
   EXPECT_EQ(clusters, expected_clusters);
 }
 
+TEST(XlaCompilationTest, ClusterControlTrigger) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+
+  Output recv_a = ops::_Recv(root.WithOpName("recv_a"), DT_BOOL, "tensor_a",
+                             "sender", 0, "receiver");
+  Output recv_b = ops::_Recv(root.WithOpName("recv_b"), DT_BOOL, "tensor_b",
+                             "sender", 0, "receiver");
+  Output const_a = ops::Const(root.WithOpName("const_a"), 42);
+
+  ops::ControlTrigger ctrl_trigger_a(root.WithOpName("ctrl_trigger_a"));
+  ops::ControlTrigger ctrl_trigger_b(root.WithOpName("ctrl_trigger_b"));
+  root.graph()->AddControlEdge(recv_a.node(), ctrl_trigger_a.operation.node());
+  root.graph()->AddControlEdge(recv_b.node(), ctrl_trigger_a.operation.node());
+  root.graph()->AddControlEdge(ctrl_trigger_b.operation.node(), const_a.node());
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+
+  ASSERT_FALSE(clusters.empty());
+  string cluster_name = clusters.begin()->second;
+
+  // ctrl_trigger_a has inputs with mismatching deadness so it won't be
+  // clustered.  ctrl_trigger_b is okay to cluster.
+  std::unordered_map<string, string> expected_clusters(
+      {{"const_a", cluster_name}, {"ctrl_trigger_b", cluster_name}});
+  EXPECT_EQ(clusters, expected_clusters);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc
index 7ed609c43748062656b631243c01d790519c54fd..54a41a4daa790401c797277e7eaab531dd34ac80 100644
--- a/tensorflow/compiler/jit/xla_compilation_cache.cc
+++ b/tensorflow/compiler/jit/xla_compilation_cache.cc
@@ -40,7 +40,23 @@ namespace tensorflow {
 XlaCompilationCache::XlaCompilationCache(xla::LocalClient* client,
                                          DeviceType device_type)
     : client_(client), device_type_(std::move(device_type)) {}
-XlaCompilationCache::~XlaCompilationCache() = default;
+XlaCompilationCache::~XlaCompilationCache() {
+  // Ensure any use of our programs have completed by waiting for all stream
+  // executors to complete.
+  for (auto* executor : client_->backend().stream_executors()) {
+    bool ok = executor->SynchronizeAllActivity();
+    if (!ok) {
+      LOG(ERROR) << "Error synchronizing activity while waiting for all "
+                    "programs to complete";
+    }
+  }
+  // TODO(b/110813685): Think about the program ownership model. Programs are
+  // currently owned by the compilation cache which means we must wait for
+  // program completion in the destructor. There are multiple compilation caches
+  // around, which complicates things a little. Perhaps having programs be
+  // shared_ptrs (an invasive change) would make the model easier to reason
+  // about?
+}
 
 string XlaCompilationCache::DebugString() {
   return "XLA JIT compilation cache";
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index fccdb143680353ccbe3106bd48aa297980179d55..4a5942fbd7f5bfd28e1ec96c6b0dc9e28dd418c5 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/allocator.h"
@@ -153,17 +154,17 @@ class XlaDevice : public LocalDevice {
   // stream are executed on the device. Operations include data
   // copying back and forth between CPU and the device, and
   // computations enqueued by XLA.
-  xla::Backend::StreamPtr stream_;
+  xla::StreamPool::Ptr stream_;
   // If true, only stream_ is valid and all computation and transfers use
   // stream_. If false, computation is performed by stream_ and transfers are
   // performed by host_to_device/device_to_host_stream.
   bool use_multiple_streams_;
   // If use_multiple_streams_, host to device transfers are performed using this
   // stream.
-  xla::Backend::StreamPtr host_to_device_stream_;
+  xla::StreamPool::Ptr host_to_device_stream_;
   // If use_multiple_streams_, device to host transfers are performed using this
   // stream.
-  xla::Backend::StreamPtr device_to_host_stream_;
+  xla::StreamPool::Ptr device_to_host_stream_;
   // Must we use XLA's transfer manager for correct host<->device transfers? if
   // false, we can use ThenMemcpy() instead.
   bool transfer_as_literal_;
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc
index 04778c00904d484aa55884b86f10759ea3c3df20..8cf198239c84c3720585f53ebc95876ce4396793 100644
--- a/tensorflow/compiler/jit/xla_device_context.cc
+++ b/tensorflow/compiler/jit/xla_device_context.cc
@@ -74,43 +74,64 @@ Status XlaTransferManager::TransferLiteralToDevice(
   xla::Shape xla_shape;
   TF_RETURN_IF_ERROR(TensorShapeToXLAShape(host_tensor.dtype(),
                                            host_tensor.shape(), &xla_shape));
-  xla::BorrowingLiteral literal(
+  // Create a reference to hold onto host_tensor until after the literal has
+  // been transferred. Also make sure the literal exists until the function
+  // asynchronously completes, as it will be wrapped in an xla::LiteralSlice.
+  TensorReference ref(host_tensor);
+  auto literal = std::make_shared<xla::BorrowingLiteral>(
       static_cast<const char*>(DMAHelper::base(&host_tensor)), xla_shape);
 
   XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor);
   const xla::ShapedBuffer& shaped_buffer = xla_tensor->shaped_buffer();
-  VLOG(1) << "Transfer to device as literal: " << literal.ToString() << " "
+  VLOG(1) << "Transfer to device as literal: " << literal->ToString() << " "
           << shaped_buffer.ToString();
-  TF_RETURN_IF_ERROR(transfer_manager_->TransferLiteralToDevice(
-      host_to_device_stream_, literal, shaped_buffer));
+  if (UseMultipleStreams()) {
+    // Initially wait for the compute stream so that memory allocations are
+    // synchronized.
+    host_to_device_stream_->ThenWaitFor(stream_);
+  }
+  TF_RETURN_IF_ERROR(transfer_manager_->TransferLiteralToDeviceAsync(
+      host_to_device_stream_, *literal, shaped_buffer));
   if (UseMultipleStreams()) {
     se::Event event(stream_->parent());
     TF_RET_CHECK(event.Init()) << "Event failed to initialize!";
     host_to_device_stream_->ThenRecordEvent(&event);
     xla_tensor->SetDefinedOn(host_to_device_stream_, std::move(event));
   }
+  // Unref the host tensor, and capture the literal shared_ptr too so it goes
+  // out of scope when the lambda completes.
+  host_to_device_stream_->ThenDoHostCallback([ref, literal]() { ref.Unref(); });
   return Status::OK();
 }
 
-Status XlaTransferManager::TransferLiteralFromDevice(
-    Tensor* host_tensor, const Tensor& device_tensor) const {
+void XlaTransferManager::TransferLiteralFromDevice(
+    Tensor* host_tensor, const Tensor& device_tensor,
+    const StatusCallback& done) const {
   const xla::ShapedBuffer& shaped_buffer =
       XlaTensor::FromTensor(&device_tensor)->shaped_buffer();
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Literal> literal,
-                      transfer_manager_->TransferLiteralFromDevice(
-                          device_to_host_stream_, shaped_buffer));
-  VLOG(1) << "Transfer from device as literal: " << literal->ToString() << " "
-          << shaped_buffer.ToString();
-  Tensor tensor;
-  TF_RETURN_IF_ERROR(
-      LiteralToHostTensor(*literal, host_tensor->dtype(), &tensor));
-  // Reshape the tensor back to its declared shape.
-  if (!host_tensor->CopyFrom(tensor, device_tensor.shape())) {
-    return errors::Internal(
-        "Tensor::CopyFrom failed when copying from XLA device to CPU");
-  }
-  return Status::OK();
+  TensorReference ref(device_tensor);
+  transfer_manager_->TransferLiteralFromDevice(
+      device_to_host_stream_, shaped_buffer,
+      [=, &shaped_buffer](
+          xla::StatusOr<std::unique_ptr<xla::Literal> > literal_or) {
+        ref.Unref();
+        done([&]() -> Status {
+          TF_ASSIGN_OR_RETURN(auto literal, std::move(literal_or));
+          VLOG(1) << "Transfer from device as literal: " << literal->ToString()
+                  << " " << shaped_buffer.ToString();
+          Tensor tensor;
+          TF_RETURN_IF_ERROR(
+              LiteralToHostTensor(*literal, host_tensor->dtype(), &tensor));
+          // Reshape the tensor back to its declared shape.
+          Status status;
+          if (!host_tensor->CopyFrom(tensor, device_tensor.shape())) {
+            status = errors::Internal(
+                "Tensor::CopyFrom failed when copying from XLA device to CPU");
+          }
+          return status;
+        }());
+      });
 }
 
 void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
@@ -163,6 +184,12 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor,
       return;
     }
     status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor);
+    if (status.ok()) {
+      xla_tensor->set_host_tensor(*cpu_tensor);
+      host_to_device_stream_->ThenDoHostCallback(
+          [done]() { done(Status::OK()); });
+      return;
+    }
   } else {
     se::DeviceMemoryBase dev_dst_ptr =
         XlaTensor::DeviceMemoryFromTensor(*device_tensor);
@@ -212,7 +239,8 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor,
 
   Status status;
   if (transfer_as_literal_) {
-    status = TransferLiteralFromDevice(cpu_tensor, *device_tensor);
+    TransferLiteralFromDevice(cpu_tensor, *device_tensor, done);
+    return;
   } else {
     device_to_host_stream_->ThenMemcpy(dst_ptr, dev_src_ptr, total_bytes);
     // TODO(hpucha): Make this asynchronous.
@@ -234,15 +262,15 @@ void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor,
           << reinterpret_cast<const void*>(src_tensor.tensor_data().data())
           << " "
           << reinterpret_cast<const void*>(dst_tensor->tensor_data().data());
-  // TODO(phawkins): replace this code with an asynchronous implementation.
-  auto body = [&]() {
+  // Perform memory allocation now, and enqueue the device-to-device transfer.
+  Status status = [&]() -> Status {
     if (src_tensor.NumElements() == 0) {
       return Status::OK();
     }
     // TODO(jmolloy): We co-opt the device_to_host stream for device to device
     // transfers; perhaps we should have a dedicated device to device stream? or
     // one per device?
-    auto device_to_device_stream = device_to_host_stream_;
+    auto device_to_device_stream = stream_;
     XlaTensor* xla_src = XlaTensor::FromTensor(&src_tensor);
     XlaTensor* xla_dst = XlaTensor::FromTensor(dst_tensor);
     CHECK(xla_src && xla_dst)
@@ -254,29 +282,40 @@ void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor,
       TF_RETURN_IF_ERROR(
           xla_dst->AllocateShapedBuffer(src_tensor.dtype(), shape, client_,
                                         stream_->parent()->device_ordinal()));
+      if (stream_ != device_to_device_stream) {
+        // Initially wait for the compute stream so that memory allocations are
+        // synchronized.
+        device_to_device_stream->ThenWaitFor(stream_);
+      }
     }
 
     if (se::Event* event =
             xla_src->GetDefinitionEvent(device_to_device_stream)) {
       device_to_device_stream->ThenWaitFor(event);
       xla_src->SetDefinedOn(device_to_device_stream);
-      TF_RETURN_IF_ERROR(device_to_device_stream->BlockHostUntilDone());
     }
-    TF_RETURN_IF_ERROR(
-        xla_dst->shaped_buffer().buffers().ForEachMutableElementWithStatus(
-            [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) {
-              const se::DeviceMemoryBase& from_buffer =
-                  xla_src->shaped_buffer().buffers().element(index);
-              CHECK_EQ(buffer->size(), from_buffer.size());
-              if (!stream_->parent()->SynchronousMemcpy(buffer, from_buffer,
-                                                        buffer->size())) {
-                return errors::Internal("Device to device memcpy failed");
-              }
-              return Status::OK();
-            }));
+
+    auto from_iter = xla_src->shaped_buffer().buffers().begin();
+    auto to_iter = xla_dst->shaped_buffer().buffers().begin();
+    for (auto end_iter = xla_src->shaped_buffer().buffers().end();
+         from_iter != end_iter; ++from_iter, ++to_iter) {
+      device_to_device_stream->ThenMemcpyD2D(
+          &to_iter->second, from_iter->second, to_iter->second.size());
+    }
+
+    if (UseMultipleStreams()) {
+      se::Event event(stream_->parent());
+      CHECK(event.Init());
+      device_to_device_stream->ThenRecordEvent(&event);
+      xla_dst->SetDefinedOn(device_to_device_stream, std::move(event));
+    }
     return Status::OK();
-  };
-  done(body());
+  }();
+  if (!status.ok()) {
+    return done(status);
+  } else {
+    stream_->ThenDoHostCallback([=]() { done(Status::OK()); });
+  }
 }
 
 XlaDeviceContext::XlaDeviceContext(
diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h
index c726495f96883138892655797ab21257623daf31..912f8d779e72f44821bc4fb25efa30bd35d01412 100644
--- a/tensorflow/compiler/jit/xla_device_context.h
+++ b/tensorflow/compiler/jit/xla_device_context.h
@@ -66,8 +66,9 @@ class XlaTransferManager {
  private:
   Status TransferLiteralToDevice(const Tensor& host_tensor,
                                  Tensor* device_tensor) const;
-  Status TransferLiteralFromDevice(Tensor* host_tensor,
-                                   const Tensor& device_tensor) const;
+  void TransferLiteralFromDevice(Tensor* host_tensor,
+                                 const Tensor& device_tensor,
+                                 const StatusCallback& done) const;
   bool UseMultipleStreams() const { return stream_ != host_to_device_stream_; }
 
   // The main compute stream of the device, used to synchronize the transfer
diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer.cc b/tensorflow/compiler/jit/xla_fusion_optimizer.cc
index 74257b09a808a39454eace3b1a9bf57a2e071360..4b499b161371ecece14447b29fbf809b6e8857db 100644
--- a/tensorflow/compiler/jit/xla_fusion_optimizer.cc
+++ b/tensorflow/compiler/jit/xla_fusion_optimizer.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <unordered_map>
 #include <unordered_set>
 
+#include "tensorflow/compiler/jit/deadness_analysis.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
 #include "tensorflow/compiler/jit/union_find.h"
@@ -146,6 +147,9 @@ Status XlaFusionOptimizer::Optimize(grappler::Cluster* cluster,
   TF_RETURN_IF_ERROR(
       ImportGraphDef(options, item.graph, &graph, &shape_refiner));
 
+  std::unique_ptr<DeadnessAnalysis> deadness;
+  TF_RETURN_IF_ERROR(DeadnessAnalysis::Run(graph, &deadness));
+
   // Collect nodes that can be fused via XLA, while ignoring those that
   // explicitly ask for XLA: (*) nodes that are marked to be compiled
   // explicitly. (*) nodes assigned to XLA device.
@@ -185,6 +189,14 @@ Status XlaFusionOptimizer::Optimize(grappler::Cluster* cluster,
       continue;
     }
 
+    // If inputs to `node` can have conflicting deadness (i.e. some are alive
+    // and some are dead) then don't compile it.  XLA cannot represent the
+    // deadness semantics of these nodes correctly and auto-clustering these
+    // nodes can cause deadness to propagate to nodes that should be live.
+    if (node->IsMerge() || deadness->HasInputsWithMismatchingDeadness(*node)) {
+      continue;
+    }
+
     compilation_candidates.insert(node);
   }
 
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 616c3ed2a26fd30478fcee0d9b4d52a58cdb0921..6134b8c6946429918a5ca37188cbff13a6cd1c79 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -64,11 +64,13 @@ xla::StatusOr<xla::OwningDeviceMemory> XlaAllocator::Allocate(
     int device_ordinal, uint64 size, bool retry_on_failure) {
   AllocationAttributes attrs;
   attrs.no_retry_on_failure = !retry_on_failure;
-  void* data =
-      wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size, attrs);
-  if (data == nullptr) {
-    return errors::ResourceExhausted("Out of memory while trying to allocate ",
-                                     size, " bytes.");
+  void* data = nullptr;
+  if (size != 0) {
+    data = wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size, attrs);
+    if (data == nullptr) {
+      return errors::ResourceExhausted(
+          "Out of memory while trying to allocate ", size, " bytes.");
+    }
   }
   return xla::OwningDeviceMemory(se::DeviceMemoryBase(data, size),
                                  device_ordinal, this);
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index 90531174ff149bc6144fdba9b6463b8ef5b885f6..1ea3fa4cf29266e8c452385226e56bd0b82622d9 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -122,7 +122,11 @@ class XlaTensorBuffer : public TensorBuffer {
     data_ = const_cast<void*>(ptr);
   }
 
-  ~XlaTensorBuffer() override { allocator_->DeallocateRaw(data_); }
+  ~XlaTensorBuffer() override {
+    if (data_) {
+      allocator_->DeallocateRaw(data_);
+    }
+  }
 
   void* data() const override { return data_; }
   size_t size() const override { return expected_size_; }
diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc
index 5dff187fffd4065559e98653b3e3c160d9ea4e8b..d777dfa5a34fb9615ddcf393ed53be1491cb70af 100644
--- a/tensorflow/compiler/jit/xla_tensor.cc
+++ b/tensorflow/compiler/jit/xla_tensor.cc
@@ -92,10 +92,8 @@ se::Event* XlaTensor::GetDefinitionEvent(se::Stream* stream) {
 
 void XlaTensor::SetDefinedOn(se::Stream* stream, se::Event event) {
   mutex_lock lock(mu_);
-  CHECK(!definition_event_.has_value())
-      << "SetDefinedOn must only be called once!";
   definition_event_ = std::move(event);
-  streams_defined_on_.push_back(stream);
+  streams_defined_on_ = {stream};
 }
 
 void XlaTensor::SetDefinedOn(se::Stream* stream) {
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 9cb3d0454608c37e669d5b4360bc39bf1bf7e68c..0aafda7fb4d710f154157ee352d6616e5aa8935f 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -691,11 +691,13 @@ class BinaryOpsTest(xla_test.XLATestCase):
           np.array([[10], [7], [2]], dtype=np.float32),
           np.float32(7),
           expected=np.array([[False], [False], [True]], dtype=np.bool))
-      self._testBinary(
-          less_op,
-          np.array([[10], [7], [2], [-1]], dtype=np.int64),
-          np.int64(7),
-          expected=np.array([[False], [False], [True], [True]], dtype=np.bool))
+      if np.int64 in self.numeric_types:
+        self._testBinary(
+            less_op,
+            np.array([[10], [7], [2], [-1]], dtype=np.int64),
+            np.int64(7),
+            expected=np.array(
+                [[False], [False], [True], [True]], dtype=np.bool))
 
     for less_equal_op in [math_ops.less_equal, (lambda x, y: x <= y)]:
       self._testBinary(
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index ff002d15b09f5a9dc1a69741672503656f893298..338943201bb11a66370d82f301736a0d8d0fc7ed 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -81,7 +81,7 @@ cc_library(
         "//tensorflow/compiler/tf2xla/kernels:xla_cpu_only_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla/client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
@@ -119,6 +119,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:cpu_plugin",
         "//tensorflow/compiler/xla/service/cpu:cpu_executable",
         "//tensorflow/core:lib",
@@ -139,14 +140,12 @@ cc_library(
         "xla_op_registry.cc",
         "xla_resource.cc",
         "xla_cpu_backend.cc",
-        "legacy_flags/backend_registration_flags.cc",
     ] + if_cuda_is_configured([
         "xla_gpu_backend.cc",
     ]),
     hdrs = [
         "const_analysis.h",
         "graph_compiler.h",
-        "legacy_flags/backend_registration_flags.h",
         "xla_compilation_device.h",
         "xla_compiler.h",
         "xla_context.h",
@@ -172,16 +171,14 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/compiler/xla/client/lib:numeric",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
-        "//tensorflow/compiler/xla/legacy_flags:parse_flags_from_env",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
@@ -294,6 +291,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:cpu_plugin",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/tf2xla/dump_graph.cc b/tensorflow/compiler/tf2xla/dump_graph.cc
index 03603ee9baefd1d20d220faf63c9c1c427ebdf31..24616c01c7e54b2e8662457ca6af23a0bc563e08 100644
--- a/tensorflow/compiler/tf2xla/dump_graph.cc
+++ b/tensorflow/compiler/tf2xla/dump_graph.cc
@@ -33,7 +33,7 @@ struct NameCounts {
   std::unordered_map<string, int> counts;
 };
 
-string MakeUniquePath(string name) {
+string MakeUniqueFilename(string name) {
   static NameCounts& instance = *new NameCounts;
 
   // Remove illegal characters from `name`.
@@ -50,26 +50,41 @@ string MakeUniquePath(string name) {
     count = instance.counts[name]++;
   }
 
-  legacy_flags::DumpGraphFlags* flags = legacy_flags::GetDumpGraphFlags();
-  string path = strings::StrCat(flags->tf_dump_graph_prefix, "/", name);
+  string filename = name;
   if (count > 0) {
-    strings::StrAppend(&path, "_", count);
+    strings::StrAppend(&filename, "_", count);
   }
-  strings::StrAppend(&path, ".pbtxt");
-  return path;
+  strings::StrAppend(&filename, ".pbtxt");
+  return filename;
+}
+
+string WriteTextProtoToUniqueFile(
+    Env* env, const string& name, const char* proto_type,
+    const ::tensorflow::protobuf::Message& proto) {
+  const string& dirname =
+      legacy_flags::GetDumpGraphFlags()->tf_dump_graph_prefix;
+  Status status = env->RecursivelyCreateDir(dirname);
+  if (!status.ok()) {
+    LOG(WARNING) << "Failed to create " << dirname << " for dumping "
+                 << proto_type << ": " << status;
+    return "(unavailable)";
+  }
+  string filepath = strings::StrCat(dirname, "/", MakeUniqueFilename(name));
+  status = WriteTextProto(Env::Default(), filepath, proto);
+  if (!status.ok()) {
+    LOG(WARNING) << "Failed to dump " << proto_type << " to file: " << filepath
+                 << " : " << status;
+    return "(unavailable)";
+  }
+  LOG(INFO) << "Dumped " << proto_type << " to " << filepath;
+  return filepath;
 }
 
 }  // anonymous namespace
 
 string DumpGraphDefToFile(const string& name, GraphDef const& graph_def) {
-  string path = MakeUniquePath(name);
-  Status status = WriteTextProto(Env::Default(), path, graph_def);
-  if (!status.ok()) {
-    VLOG(1) << "Failed to dump GraphDef to file: " << path << " : " << status;
-    path.clear();
-    path = "(unavailable)";
-  }
-  return path;
+  return WriteTextProtoToUniqueFile(Env::Default(), name, "GraphDef",
+                                    graph_def);
 }
 
 string DumpGraphToFile(const string& name, Graph const& graph,
@@ -83,15 +98,7 @@ string DumpGraphToFile(const string& name, Graph const& graph,
 }
 
 string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef) {
-  string path = MakeUniquePath(name);
-  Status status = WriteTextProto(Env::Default(), path, fdef);
-  if (!status.ok()) {
-    VLOG(1) << "Failed to dump FunctionDef to file: " << path << " : "
-            << status;
-    path.clear();
-    path = "(unavailable)";
-  }
-  return path;
+  return WriteTextProtoToUniqueFile(Env::Default(), name, "FunctionDef", fdef);
 }
 
 }  // namespace dump_graph
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 6cc95149a16a59fce8486c5d103ad09e3e262765..0904778f97c95628c81054cd4bc2ff32ff440a33 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -177,8 +177,8 @@ Status CheckNoCycleContains(const Node* node, const int num_nodes) {
     visited[current_node->id()] = true;
     for (const Edge* out : current_node->out_edges()) {
       if (out->dst() == node) {
-        return errors::Internal("Detect a cycle: Node \"", node->name(), "\"(",
-                                node->def().op(), ") feeds into itself.");
+        return errors::Internal("Detected a cycle: ", FormatNodeForError(*node),
+                                "(", node->def().op(), ") feeds into itself.");
       } else if (!visited[out->dst()->id()]) {
         ready.push_back(out->dst());
       }
@@ -324,7 +324,7 @@ Status AddMissingFunctionDef(const FunctionDef& fdef,
     if (library->Find(node.op())) {
       continue;
     }
-    // The function refered by 'SymbolicGradient' node is specified in its
+    // The function referred by 'SymbolicGradient' node is specified in its
     // attribute 'f'.
     if (node.op() == FunctionLibraryDefinition::kGradientOp) {
       const AttrValue* attr =
@@ -437,22 +437,24 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
           continue;
         }
         if (enter_merge != nullptr) {
-          return errors::Internal(
-              "Enter node for loop-varying argument ", arg.enter->name(),
-              " has multiple successors: ", enter_merge->dst()->name(), " and ",
-              e->dst()->name());
+          return errors::Internal("Enter node for loop-varying argument ",
+                                  FormatNodeForError(*arg.enter),
+                                  " has multiple successors: ",
+                                  FormatNodeForError(*enter_merge->dst()),
+                                  " and ", FormatNodeForError(*e->dst()));
         }
         enter_merge = e;
       }
       if (enter_merge == nullptr) {
         return errors::Internal("Enter node for loop-varying argument ",
-                                arg.enter->name(), " has zero successors");
+                                FormatNodeForError(*arg.enter),
+                                " has zero successors");
       }
       arg.merge = enter_merge->dst();
       if (!IsMerge(arg.merge)) {
         return errors::InvalidArgument(
             "Successor of Enter node for loop-varying argument ",
-            arg.merge->name(),
+            FormatNodeForError(*arg.merge),
             " is not a Merge node; got: ", arg.merge->type_string());
       }
 
@@ -462,7 +464,7 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
         return errors::InvalidArgument(
             "Unexpected number of inputs to Merge node for loop-varying "
             "argument ",
-            arg.merge->name(), "; expected 2, got ",
+            FormatNodeForError(*arg.merge), "; expected 2, got ",
             arg.merge->input_types().size());
       }
       TF_RETURN_IF_ERROR(arg.merge->input_node(1 - enter_merge->dst_input(),
@@ -470,7 +472,7 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
       if (!IsNextIteration(arg.next_iteration)) {
         return errors::InvalidArgument(
             "Expected NextIteration node as input to Merge node; got node ",
-            arg.next_iteration->name(), " with kind ",
+            FormatNodeForError(*arg.next_iteration), " with kind ",
             arg.next_iteration->type_string());
       }
 
@@ -481,14 +483,14 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
             switches.find(edge->dst()) != switches.end()) {
           if (arg.switch_node != nullptr) {
             return errors::InvalidArgument("Duplicate Switch successors to ",
-                                           arg.merge->name());
+                                           FormatNodeForError(*arg.merge));
           }
           arg.switch_node = edge->dst();
         }
       }
       if (arg.switch_node == nullptr) {
         return errors::InvalidArgument("Missing Switch successor to ",
-                                       arg.merge->name());
+                                       FormatNodeForError(*arg.merge));
       }
 
       // Update the device on the Identity outputs of the switch to match their
@@ -516,14 +518,15 @@ Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library,
         possible_exit.pop_front();
         if (IsExit(edge->dst())) {
           if (arg.exit != nullptr) {
-            return errors::InvalidArgument("Duplicate Exit successors to ",
-                                           arg.switch_node->name());
+            return errors::InvalidArgument(
+                "Duplicate Exit successors to ",
+                FormatNodeForError(*arg.switch_node));
           }
           arg.exit = edge->dst();
         } else {
           if (!IsIdentity(edge->dst())) {
             return errors::Unimplemented("General graph between switch (",
-                                         arg.switch_node->name(),
+                                         FormatNodeForError(*arg.switch_node),
                                          ") and exit node of frame ",
                                          frame->name, " not supported yet.");
           }
@@ -1470,7 +1473,7 @@ Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library,
   if (!unreachable_nodes.empty()) {
     return errors::InvalidArgument(
         "The following nodes are unreachable from the source in the graph: ",
-        tensorflow::str_util::Join(unreachable_nodes, ", "));
+        errors::FormatNodeNamesForError(unreachable_nodes));
   }
 
   // Builds Frames, indexed by name.
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
index aae2f8ee5acd6249f8b6002d94c877f18064f936..ccf249b35d66861888ad5e5e904b5f63b8ac50a1 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
@@ -1064,7 +1064,10 @@ TEST(FunctionalizeControlFlow, Cycle) {
   // less -> XlaIf <--> identity.
   Status status = FunctionalizeControlFlow(graph.get(), &library);
   EXPECT_FALSE(status.ok());
-  EXPECT_TRUE(str_util::StrContains(status.error_message(), "Detect a cycle"))
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), "Detected a cycle"))
+      << status.error_message();
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(), "{{node cond/Less_5_If}}"))
       << status.error_message();
 }
 
diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc
index e1cea03865ce9978e634429b5ce41fe8b245a575..e4fdf0a6186eb69a2e3413838c91616b992ef2d6 100644
--- a/tensorflow/compiler/tf2xla/graph_compiler.cc
+++ b/tensorflow/compiler/tf2xla/graph_compiler.cc
@@ -29,7 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/executor.h"
 #include "tensorflow/core/common_runtime/function.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index d88a34dfd9bf7e9a5a9e65004b87833dd35de668..f96483d23d693a1c94661d3bed8fd277fd46b979 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -123,12 +123,13 @@ tf_kernel_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/compiler/xla/client/lib:math",
         "//tensorflow/compiler/xla/client/lib:numeric",
         "//tensorflow/compiler/xla/client/lib:prng",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/core:framework",
         "//tensorflow/core:image_ops_op_lib",
         "//tensorflow/core:lib",
@@ -164,7 +165,8 @@ tf_kernel_library(
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -180,7 +182,7 @@ tf_kernel_library(
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
         "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -217,8 +219,8 @@ tf_kernel_library(
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla/client:client_library",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/kernels:argmax_op",
diff --git a/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc b/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc
index e33532828040123243f839ab1aa655b4bbc72520..41a453da80dec6b6f57a4d222e2c33ef6b786a10 100644
--- a/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
index c4af79281d2162b1dbfb0a7881720892f4bc49d2..b3ad0aea84eef601de08909f760699b8700d28f4 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
index 26130fd9e7fce75c6d2a5a53cfc85842cf762b35..48f2a005ab16651fe29d0f6f9d881f95693da461 100644
--- a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
index e9b2c0b16d39cb3b747c0316621fb01de709b12e..41f540506ba41fbe7f91393e7b8e26a89e72ef0a 100644
--- a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/util/tensor_format.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
index d6d4ae89376b67c14af8ef4f3a608fcc83b6fb59..2c328102e0bd84709707f102272691b6aec9a577 100644
--- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc b/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc
index efbdb76eaaf78904fe783a018940b1b096ec39bd..5078f8662bd397eaa51274ec816c130b8ced92cc 100644
--- a/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/cast_op.cc b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
index 62eebf762b3e063da8ec456cc4726d3cc9b77d1d..8cc2479dd555380da7500abe6b2aca380110333b 100644
--- a/tensorflow/compiler/tf2xla/kernels/cast_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cast_op.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
index 1784e712b56145bbdff5f1daa2e031b65d0774b6..e7fef77edcba0ea5a521956a704225ac4f7fcb22 100644
--- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc
index 4e6d33304c4ae08a0fd1e0a8373267a527087528..547fe48046e8c934e3bc14d02c8448e107c1a406 100644
--- a/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/concat_op.cc b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
index e3a32a5c0e2f93237c8c7ebeea3668b5d1ab6c23..f4106051043859a6786705009d76b02a64cd3ff1 100644
--- a/tensorflow/compiler/tf2xla/kernels/concat_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/concat_op.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc
index f4360d8c3f6fc4007c31fdcfd7f7634de15c76d4..da8cf3fc6fa694f592280f8c249d317827d9cd09 100644
--- a/tensorflow/compiler/tf2xla/kernels/const_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index 48ac4867edcef97be001a24f42f6a35225d466c9..5da7972397b32fb4a2f216913e065c04131a3773 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/cross_op.cc b/tensorflow/compiler/tf2xla/kernels/cross_op.cc
index 500a564f3f0489a42dbc9d5b70ae7708a7a43973..db579a5b35d69deb3dca578e31c1b54fada76342 100644
--- a/tensorflow/compiler/tf2xla/kernels/cross_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cross_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc
index 9ff3e0222831cb4339943966810eeae451e47a2c..ef1015552d181a183d412f9c269dd5ec608b388f 100644
--- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/util/bcast.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
index 4f92dbc8740b697322424058530b8477c35d809a..a5b870f8dbf70bcee331992345d63fd5d986bdca 100644
--- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
+++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h
@@ -20,7 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/util/bcast.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
index f3149200250935629a6e4bf67bff0c048135ce3e..12b0e38288e8f222ed506a75ec2575f27141c859 100644
--- a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
index 6dec414c53bee6b0102e229c86cfafb4072a35f0..ed44ad218b6dc073583ec339da082b6881ad672d 100644
--- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
@@ -123,8 +123,6 @@ class DiagPartOp : public XlaOpKernel {
   explicit DiagPartOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    xla::XlaBuilder* builder = ctx->builder();
-
     const TensorShape input_shape = ctx->InputShape(0);
     auto dims = input_shape.dim_sizes();
 
@@ -150,37 +148,13 @@ class DiagPartOp : public XlaOpKernel {
       new_dims.push_back(dims[i]);
     }
 
-    xla::XlaOp diag = ctx->Input(0);
-
-    // TODO(b/30878775): use Slice with strides when supported, in place of
-    // the Pad -> Reshape -> Slice.
-
-    // Picture:
-    // [[1, 0, 0, 0]  pad and reshape to [[1, 0, 0, 0, 0],
-    //  [0, 2, 0, 0]  =================>  [2, 0, 0, 0, 0],
-    //  [0, 0, 3, 0]                      [3, 0, 0, 0, 0],
-    //  [0, 0, 0, 4]]                     [4, 0, 0, 0, 0]]
-    // and then slice out the first column.
-
-    // Flattens the input to 1D.
-    int64 size = input_shape.num_elements();
-    diag = xla::Reshape(diag, {size});
-
-    // Adds padding after the last element of 'new_size'.
-    xla::PaddingConfig config;
-    auto* dim = config.add_dimensions();
-    dim->set_edge_padding_high(new_size);
-    auto zero = XlaHelpers::Zero(builder, input_type(0));
-    diag = xla::Pad(diag, zero, config);
-
-    // Reshapes so the diagonal is now in the first column.
-    diag = xla::Reshape(diag, {new_size, new_size + 1});
+    xla::XlaOp input = ctx->Input(0);
 
-    // Slices out the first column and reshapes to the final shape.
-    diag = xla::Slice(diag, {0, 0}, {new_size, 1}, {1, 1});
-    diag = xla::Reshape(diag, new_dims);
+    xla::XlaOp output = xla::Reshape(
+        xla::GetMatrixDiagonal(xla::Reshape(input, {new_size, new_size})),
+        new_dims);
 
-    ctx->SetOutput(0, diag);
+    ctx->SetOutput(0, output);
   }
 };
 
@@ -220,8 +194,6 @@ class MatrixDiagPartOp : public XlaOpKernel {
   explicit MatrixDiagPartOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
 
   void Compile(XlaOpKernelContext* ctx) override {
-    xla::XlaBuilder* builder = ctx->builder();
-
     const TensorShape input_shape = ctx->InputShape(0);
     auto dims = input_shape.dim_sizes();
 
@@ -229,71 +201,8 @@ class MatrixDiagPartOp : public XlaOpKernel {
                 errors::InvalidArgument("Expected 2 <= dims, got shape ",
                                         input_shape.DebugString()));
 
-    xla::XlaOp diag = ctx->Input(0);
-
-    int last_dim = dims.size() - 1;
-    int64 last_dim_size = dims[last_dim];
-
-    // The smaller of the last two dimension sizes.
-    int64 smaller_dim_size = std::min(dims[last_dim - 1], dims[last_dim]);
-
-    // TODO(b/30878775): use Slice with strides when supported, in place of
-    // the Pad -> Reshape -> Slice.
-
-    // Picture: for each 2D matrix in the tensor's last two dimensions:
-    // [[1, 0, 0, 0]  pad and reshape to [[1, 0, 0, 0, 0],
-    //  [0, 2, 0, 0]  =================>  [2, 0, 0, 0, 0],
-    //  [0, 0, 3, 0]]                     [3, 0, 0, 0, 0],
-    // and then slice out the first column.
-    //
-    // Another example, with tall and narrow input.
-    // [[1, 0]  pad and reshape to [[1, 0, 0],
-    //  [0, 2]  =================>  [2, 0, 0]]
-    //  [0, 0]
-    //  [0, 0]]
-
-    // Collapses the last two dimensions.
-    std::vector<int64> flattened_dims(dims.begin(), dims.end() - 1);
-    flattened_dims.back() *= dims.back();
-    diag = xla::Reshape(diag, flattened_dims);
-
-    // Slices or pads the last dimension to 'target_size'.
-    int64 actual_size = flattened_dims.back();
-    int64 target_size = smaller_dim_size * (last_dim_size + 1);
-    if (actual_size < target_size) {
-      xla::PaddingConfig config =
-          xla::MakeNoPaddingConfig(flattened_dims.size());
-      auto* dim = config.mutable_dimensions(flattened_dims.size() - 1);
-      dim->set_edge_padding_high(target_size - actual_size);
-      auto zero = XlaHelpers::Zero(builder, input_type(0));
-      diag = xla::Pad(diag, zero, config);
-    } else if (actual_size > target_size) {
-      std::vector<int64> start(flattened_dims.size(), 0);
-      std::vector<int64> limits(flattened_dims.begin(), flattened_dims.end());
-      std::vector<int64> strides(flattened_dims.size(), 1);
-      limits[flattened_dims.size() - 1] = target_size;
-      diag = xla::Slice(diag, start, limits, strides);
-    }
-
-    // Reshape so the target values are in the first position of the last
-    // dimension.
-    std::vector<int64> unflattened_dims(dims.begin(), dims.end());
-    dims[last_dim - 1] = smaller_dim_size;
-    dims[last_dim] = last_dim_size + 1;
-    diag = xla::Reshape(diag, dims);
-
-    // Slices out the first column and reshapes to the final shape.
-    std::vector<int64> start(dims.size(), 0);
-    std::vector<int64> limits(dims.begin(), dims.end());
-    std::vector<int64> strides(dims.size(), 1);
-    limits[last_dim] = 1;
-    diag = xla::Slice(diag, start, limits, strides);
-
-    // Collapses away the last dimension.
-    dims.pop_back();
-    diag = xla::Reshape(diag, dims);
-
-    ctx->SetOutput(0, diag);
+    xla::XlaOp input = ctx->Input(0);
+    ctx->SetOutput(0, xla::GetMatrixDiagonal(input));
   }
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc
index 3b86ea34c9e7d943eb9c7de222e0a2be049ebc68..a3389d5b905bf3ee15744ab4fcee193d312e2ae0 100644
--- a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
 #include "tensorflow/compiler/tf2xla/type_util.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
index 958231505b50431b9bb267b0a3cc5ed56e3aeb21..cb73053666d4c32bc0a2ef19b174aee1a29f101e 100644
--- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/elu_op.cc b/tensorflow/compiler/tf2xla/kernels/elu_op.cc
index 81f42e504e4b6f813a29769719a7a7fb5d99b9c5..5fdb1d972c55efb876972d3f472b53a1f7cde1c2 100644
--- a/tensorflow/compiler/tf2xla/kernels/elu_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/elu_op.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
index 65d42a302fca48c7b5f88813f80e975823f63ddf..c68b0bfd7961892294c2931e5c4c44de534a7740 100644
--- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
index 2fd1a34741e1c7235397f9a69dd8444b4679fa22..cdba6680dee3fade5bdf0c453ed672b653072b0d 100644
--- a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
index b2b00e51e3b00fa93c258af489cf0f4a3e6e764b..80bcef966360ec9a1ca63a02741108ce41b31846 100644
--- a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/fill_op.cc b/tensorflow/compiler/tf2xla/kernels/fill_op.cc
index 95faa1d058f4c0d3fa802b157c6daba1e1adaf41..54b21a278229024e3e54e9135548be6b69b077e1 100644
--- a/tensorflow/compiler/tf2xla/kernels/fill_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fill_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/register_types.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
index 5f041be5df226ed996b21844c0cf92b6dfac005c..35de96e0aab847fa39ef26d5f3052c392062fd7d 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
index d898e43b858bac706d524c7c271f48b1b5fa258f..92346283c31dfe1d638526ac4b26ef762cd7fd14 100644
--- a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
+++ b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h
@@ -20,7 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/util/bcast.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc
index e2160feba00a7272635207ebcb53670cacf34620..ceb2af756c2d2020c7449086b957c9fbc1cc2979 100644
--- a/tensorflow/compiler/tf2xla/kernels/if_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
index cb4caf7bcb4caaa1bf7e0e79e52bb966a8838db3..6e061ba278c99af69cdaadf84b0266ce0998ed33 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
index d6bf92fb3df8d38909df99e11c85ede4fac2bf81..8d75624e74028ea083c3facc4f9578ec14c50e6d 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/lib/math/math_util.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
index 9e64711051d31107db1bf6f1966f9ed6f5630c34..f028e361bccd51de0bd69a1d2227c7afaed53455 100644
--- a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/no_op.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc
index 2fb072f827906d40dcf410f0312394c4f568a28d..a11bbe918f7f8eb050aaa40d4344f9cc9e9a10a4 100644
--- a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/lib/core/errors.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
index dc934543cb2f94fbe1e8f1f865156eb082d6a127..87ee2d3aede50eb24e65570f106d49030e1d4236 100644
--- a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
index 844080b8cf5462da201ce7671e4f9d02fa52c861..6440770c29894c951f010f6c1deb929f4fe79bbf 100644
--- a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
@@ -54,10 +54,14 @@ class MatMulOp : public XlaOpKernel {
     const TensorShape b_shape = ctx->InputShape(1);
 
     // Check that the dimensions of the two matrices are valid.
-    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a_shape),
-                errors::InvalidArgument("In[0] is not a matrix"));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b_shape),
-                errors::InvalidArgument("In[1] is not a matrix"));
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsMatrix(a_shape),
+        errors::InvalidArgument("In[0] is not a matrix. Instead it has shape ",
+                                a_shape.DebugString()));
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsMatrix(b_shape),
+        errors::InvalidArgument("In[1] is not a matrix. Instead it has shape ",
+                                b_shape.DebugString()));
     int first_index = transpose_a_ ? 0 : 1;
     int second_index = transpose_b_ ? 1 : 0;
 
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
index e06c87db7adb1840606208fe15cd68a3ca4d137a..8dfd7de591c4a3c4768dd60b41e03d294ad49397 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
index e2ab4b83cfb45b2f9a7f3aba2d2a927d10ad8b85..c0ca881ff82cee04e0c5e35f9a2d5732fabdd8a6 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
index 529959dbd90b05f8860360f70e087ef225150600..eedfc3c9140d7b1ccc1944611de98c1d49fbdaf2 100644
--- a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/util/mirror_pad_mode.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/pack_op.cc b/tensorflow/compiler/tf2xla/kernels/pack_op.cc
index 3aed47de2603f3e187ad515d4db3f884da4c6cc8..a9b519d8928cc2807831fd6b4f12e60b7d58ea55 100644
--- a/tensorflow/compiler/tf2xla/kernels/pack_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pack_op.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/pad_op.cc b/tensorflow/compiler/tf2xla/kernels/pad_op.cc
index 89fd610bc63349d008836c3c4e6ec8927c232a54..e5937b56c17d01892928b073da09f38941ea1bbb 100644
--- a/tensorflow/compiler/tf2xla/kernels/pad_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pad_op.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/register_types.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index 12d9cb9bac6b98c8f4c3edc3f6c661acf4466f98..3d506e71e03d6b804d1ea0e63c760cfb82629f12 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -21,7 +21,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc
index e88221e4f400abeec59d85c1539d4f70bf515d3c..6f4ed496a1774dde68dd9d5fbd37995d615b678c 100644
--- a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc
@@ -19,7 +19,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/core/platform/macros.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
index 607cad798a98cfa0c6161a8154001926384e724e..2da9340625db08b14b78340c471f096baf15689d 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
@@ -27,7 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
index 76bd1e62aa1efd85d6ed489b9a6d22a2bacf2a8b..b11a4ce36da9907ce8fe377c075023a4540797fa 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc
@@ -19,7 +19,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
index be7f2bce8cb249aa51ca091e02da7dffc7d06743..0d260fa8fcaa513d7854c1e9215952404d555c70 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
index 8333f9b288e27efe9497306f031980c9eec7c99c..466e79828d111ee7cadcf713703e8f252c63e62c 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
@@ -19,7 +19,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_TF2XLA_KERNELS_REDUCTION_OPS_H_
 
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
index ed1d1c661091fd9bc443336626e593e728b82830..b52f0a0ab6290f2019bb58120be5c2364ec15bb6 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
@@ -19,7 +19,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/relu_op.cc b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
index f4b804e54677c7226d8d3429c9e8c27686d19ccf..d35777ccb1271ec6a7c9972c714d06b2415d9c34 100644
--- a/tensorflow/compiler/tf2xla/kernels/relu_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/relu_op.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
index 354fec9be75e9559b204e2afd6ee08dfc7cea872..121750a82a8c5cbe940068555ad273b7e0d22dfc 100644
--- a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
index 5be70a4ded31a988cb77cdabe3fc8a041bc3ad16..1911e6ea362f999c787cbf95dcc9137a6a630273 100644
--- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
index ec15b4cc7a523d5b8d4287bbe3321433f315063b..d962ef4a5f53470838643541f8a1e693d2f4011c 100644
--- a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
index c810456f94322acfccae18d78efa861eede4648c..03a50ef8a059e5a005c4cc2e5e98acedfea8619a 100644
--- a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
index 27ab3e1bf5b81a901e64a242e5eb343591481efe..ab094d7dd1ce9856a3c2854fd2776827d6c4b76f 100644
--- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
@@ -20,7 +20,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
index 14709bb6cbce4b3ae0f7ff859b0fa622c6eda293..f1f32699fee5f03f603f830722fe65622dee5d3e 100644
--- a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
index e2ac7da2c2630725efe3dbcc51c3f3d30e7aca2c..b22ecb7c6dbb42a33a4f4d90b18b20816df16a50 100644
--- a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/select_op.cc b/tensorflow/compiler/tf2xla/kernels/select_op.cc
index 5c010c9df23ba6c7732d87fa014879d93ff586ce..6ce50efb4aa6e3434a7c6009cf9f52f6cff9cc9f 100644
--- a/tensorflow/compiler/tf2xla/kernels/select_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/select_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
index 6281d6c6533f7f49a269f5c7e52226ba0f1d29f6..a7f5a8f1698b9d02560de427d356e9e6be5caa7c 100644
--- a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
index 5798823cd54c66dd179e3611c0041f7c5a1ff2b5..4e0cf99d8e7ff45ed9145981b5e2e637ce4d4e4b 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
index 1864584adee357ce35a3e8a38a4e3c58c356bfca..6adc3c58de63ee70c26bed47eebef955893df4a5 100644
--- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
index a71fbcd901e8919949db5873675a7e3e785bdf4e..1d7a63dc311c60927f460e281601963e21232ec7 100644
--- a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
@@ -20,7 +20,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/sort_ops.cc b/tensorflow/compiler/tf2xla/kernels/sort_ops.cc
index faaf8964ff7c40d75a493b03e6b400632117cb45..aaeeae01ccb303091a6d37d1aeb4b2a3377dc638 100644
--- a/tensorflow/compiler/tf2xla/kernels/sort_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sort_ops.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
index 8a8525efa186ed4aa02c494f7505f6245677e96e..7327258c31f21f45ff7ffffbc9db7a2a70b4a14c 100644
--- a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace tensorflow {
 namespace {
diff --git a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
index 47d282fe9ec664bbc424793e93f778ebb13c6877..4493539fe34f0ce635fdc58660d4ff90af9c9379 100644
--- a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/split_op.cc b/tensorflow/compiler/tf2xla/kernels/split_op.cc
index 242638f981198ffd7a9c5b5f6365168de59a1f85..93fc14e9efca868e84444dd0e07d7f0dfa84c042 100644
--- a/tensorflow/compiler/tf2xla/kernels/split_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/split_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
index cc4b13d3b933cdc15efd94d3ce7a353a856bcb88..5412e135478361d08965e4621ec52cfb4a792f1d 100644
--- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
@@ -24,7 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/lib/prng.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
index c2165ccd86dfa1c119790beb20af0844fb1bbda8..1062399d91bd9a9bf8c3820c5ecac534c110746d 100644
--- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index 26326f18b844fa9dc48aeedfa5dcff3d09033a18..be1814d8e3ae2c0ddad0134b9288e0ea084aa81b 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/tf2xla/xla_resource.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
index c9e56942625a009fb3660f413a845547192460d5..1233a37565d3a40c6dd2882b3139dedbf690a7b6 100644
--- a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/topk_op.cc b/tensorflow/compiler/tf2xla/kernels/topk_op.cc
index 82d4a69777b06cc3dec1ceb1a0a4163dcb1e4667..e73fb283b093908e529d42f9ec5a5a891dc1fb36 100644
--- a/tensorflow/compiler/tf2xla/kernels/topk_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/topk_op.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
index 98df73024962b8009a74976d473df752d590b47a..be5e91138656716daddcc3c7a68dbb78ecb69103 100644
--- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
index 6c721c48fe3af45aff5cd0bd5e74e2693faf9f97..f9148b394212777271f9eba51313ee17b19819af 100644
--- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
index e6ec794cfd4103f622f64a113464c2f4cbfd4215..0bdfc05726105e2d18362a691cbe2aab00bf77f3 100644
--- a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
index f951127bb95cd52864af869676a6b4c4961c1a43..8671632976023fded04c26a9780c1a67638b0916 100644
--- a/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/unpack_op.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
index bb27b5d56f3c24dc093a60e698b1080dfb76514d..2c92a585f5679242d672d0402e617ff199b94f17 100644
--- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc
index 009fdd81b22dcf54e2ed6ee539414a9f5cafb52d..1e8a376765d36ffa677ece06fbd131744299e04b 100644
--- a/tensorflow/compiler/tf2xla/kernels/while_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc
@@ -21,7 +21,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.cc b/tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.cc
deleted file mode 100644
index 661505021f820e2a87a5d414c6fe382bf6153045..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for the XLA bridge's backend registration modules.
-
-#include <mutex>  // NOLINT
-#include <vector>
-
-#include "tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.h"
-#include "tensorflow/compiler/xla/legacy_flags/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static BackendRegistrationFlags* flags;
-static std::vector<Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new BackendRegistrationFlags;
-  flags->tf_enable_prng_ops_gpu = false;
-  flag_list = new std::vector<Flag>({
-      Flag("tf_enable_prng_ops_gpu", &flags->tf_enable_prng_ops_gpu,
-           "Whether to enable PRNG ops: [RandomStandardNormal | RandomUniform "
-           "| RandomUniformInt | TruncatedNormal] on GPU."),
-  });
-  xla::legacy_flags::ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with the XLA bridge's
-// backend registration modules.
-void AppendBackendRegistrationFlags(std::vector<Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the BackendRegistrationFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-BackendRegistrationFlags* GetBackendRegistrationFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.h b/tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.h
deleted file mode 100644
index 861c923dd51f90be2acbeb23911a93e873aabdce..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_TF2XLA_LEGACY_FLAGS_BACKEND_REGISTRATION_FLAGS_H_
-#define TENSORFLOW_COMPILER_TF2XLA_LEGACY_FLAGS_BACKEND_REGISTRATION_FLAGS_H_
-
-// Legacy flags for the XLA bridge's backend registration modules.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with the XLA bridge's
-// backend registration modules.
-void AppendBackendRegistrationFlags(std::vector<tensorflow::Flag>* append_to);
-
-// The values of flags associated with the XLA bridge's backend registration
-// module.
-typedef struct {
-  // Whether to enable RandomUniform op on GPU backend.
-  // TODO (b/32333178): Remove this flag or set its default to true.
-  bool tf_enable_prng_ops_gpu;
-} BackendRegistrationFlags;
-
-// Return a pointer to the BackendRegistrationFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-BackendRegistrationFlags* GetBackendRegistrationFlags();
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_TF2XLA_LEGACY_FLAGS_BACKEND_REGISTRATION_FLAGS_H_
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index 30039e256a184197109b334c5b0e0ca4770dbb2d..cb7a40e23d539f758d963791f1c2b4d37374ade5 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -25,8 +25,8 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/core:lib",
     ],
 )
@@ -44,9 +44,9 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/core:lib",
     ],
 )
@@ -59,9 +59,9 @@ cc_library(
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/compiler/xla/client/lib:math",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/core:protos_all_cc",
     ],
 )
@@ -78,12 +78,12 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/compiler/xla/client/lib:math",
         "//tensorflow/compiler/xla/client/lib:numeric",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/core:lib",
     ],
 )
@@ -100,9 +100,9 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/core:lib",
     ],
 )
@@ -119,10 +119,10 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/compiler/xla/client/lib:numeric",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/core:lib",
     ],
 )
@@ -142,7 +142,7 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -162,8 +162,8 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/core:lib",
     ],
 )
@@ -200,8 +200,8 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/core:lib",
     ],
 )
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
index 3c4eec081ba9744226cfbd8d5392220cbf7276f3..f666d22ea44216beef74608bb4d9f33fb2fe82c6 100644
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc
+++ b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h
index d07a9486f18c0b8f26782123a8fba4ba228f71ee..8757b16a1ca6a8cec5e3c801c885e7bbbb2f2c76 100644
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.h
+++ b/tensorflow/compiler/tf2xla/lib/batch_dot.h
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
 #define TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc
index 35b137aa2cc0b5e6c2d2b917c0a95410522305c2..87d73eb3f07ebd7dfa4fef50ebe76cad0c4ed117 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.cc
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/lib/util.h"
 #include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.h b/tensorflow/compiler/tf2xla/lib/cholesky.h
index 0f6e0e9d152ec5daedeb9c0e355bfb9731759094..1bef9bb166c576ec665bb48265b4da200ddca2a0 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.h
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.h
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_
 #define TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/compiler/tf2xla/lib/qr.cc b/tensorflow/compiler/tf2xla/lib/qr.cc
index 9c8ac7af25e4222f35bedd3816fc817af7e1f068..fc0c1ee838190b1f1a7ca5b901c97e0a35232a97 100644
--- a/tensorflow/compiler/tf2xla/lib/qr.cc
+++ b/tensorflow/compiler/tf2xla/lib/qr.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
diff --git a/tensorflow/compiler/tf2xla/lib/qr.h b/tensorflow/compiler/tf2xla/lib/qr.h
index 3aa6a9b07539487b954b2d8c8d0e0bbcc49c2b42..abd2316ac961f583dd29f90f43cf6209de30bd6a 100644
--- a/tensorflow/compiler/tf2xla/lib/qr.h
+++ b/tensorflow/compiler/tf2xla/lib/qr.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_QR_H_
 #define TENSORFLOW_COMPILER_TF2XLA_LIB_QR_H_
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/compiler/tf2xla/lib/random.cc b/tensorflow/compiler/tf2xla/lib/random.cc
index 8ff10fbd3fbf9308140af84c752a5a50bec8fd32..5e7cf00ee5e063aef36a9531ff87d8fe6928ca1f 100644
--- a/tensorflow/compiler/tf2xla/lib/random.cc
+++ b/tensorflow/compiler/tf2xla/lib/random.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/lib/random.h b/tensorflow/compiler/tf2xla/lib/random.h
index 2c573fd85b2783fdac13457cdb277cf988ac40c4..59fc5d0433a51328bc78006ab1c3495d908b44ac 100644
--- a/tensorflow/compiler/tf2xla/lib/random.h
+++ b/tensorflow/compiler/tf2xla/lib/random.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_RANDOM_H_
 #define TENSORFLOW_COMPILER_TF2XLA_LIB_RANDOM_H_
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/framework/types.pb.h"
 
diff --git a/tensorflow/compiler/tf2xla/lib/scatter.cc b/tensorflow/compiler/tf2xla/lib/scatter.cc
index 6a5be1c2be57726d6c0e226407d52e7bfcebf92b..ba22eff73abab11abeb57283c63318b2e50a9ca1 100644
--- a/tensorflow/compiler/tf2xla/lib/scatter.cc
+++ b/tensorflow/compiler/tf2xla/lib/scatter.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/lib/util.h"
 #include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -132,7 +132,7 @@ xla::StatusOr<xla::XlaOp> XlaScatter(
     // Discard updates with negative indices, since some users expect this.
     auto index_in_range = xla::ReduceAll(
         xla::Le(zero_index, index), xla::ConstantR0<bool>(body_builder, true),
-        xla::CreateScalarAndComputation(body_builder));
+        xla::CreateScalarAndComputation(xla::PRED, body_builder));
 
     // Make the index in bounds to prevent implementation defined behavior.
     index = xla::Max(index, zero_index);
diff --git a/tensorflow/compiler/tf2xla/lib/scatter.h b/tensorflow/compiler/tf2xla/lib/scatter.h
index 87309e10ede320a81d173cd0a64492f88a2c7376..13a5f1b850a612bddeeac39bef431c19925351ca 100644
--- a/tensorflow/compiler/tf2xla/lib/scatter.h
+++ b/tensorflow/compiler/tf2xla/lib/scatter.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include <functional>
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/statusor.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
index e405f8dfaa9676400c9a8550bb31739a51cad561..04fa10108cef66f429392951eea70e59643a2d29 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
@@ -22,16 +22,315 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/lib/util.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/math/math_util.h"
 
 namespace tensorflow {
 
+// Get the diagonal blocks of the coefficient matrix
+xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
+  xla::XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
+    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(a));
+    int ndims = xla::ShapeUtil::Rank(shape);
+    int64 n = xla::ShapeUtil::GetDimension(shape, -1);
+    int64 num_blocks = n / block_size;
+
+    xla::XlaOp diag_blocks;
+
+    // If the coefficient matrix is exactly the block size, we just add a
+    // singleton dimension i.e. [..., n, n] -> [..., 1, n, n]
+    if (n == block_size) {
+      std::vector<int64> permutation(ndims);
+      std::iota(permutation.begin(), permutation.end(), 1);
+      permutation.insert(permutation.end() - 2, 0);
+      return Transpose(Broadcast(a, /*broadcast_sizes=*/{1}), permutation);
+    }
+
+    // We can grab entire blocks using gather
+    if (n > block_size) {
+      // Construct the starting indices of the diagonal blocks
+      auto gather_indices =
+          Transpose(Broadcast(Mul(Iota(builder, xla::S32, num_blocks),
+                                  xla::ConstantR0<int32>(builder, block_size)),
+                              /*broadcast_sizes=*/{2}),
+                    /*permutation=*/{1, 0});
+
+      // Gather the diagonal blocks
+      xla::GatherDimensionNumbers dim_numbers;
+      dim_numbers.add_output_window_dims(ndims - 1);
+      dim_numbers.add_output_window_dims(ndims);
+      dim_numbers.add_gather_dims_to_operand_dims(ndims - 2);
+      dim_numbers.add_gather_dims_to_operand_dims(ndims - 1);
+      dim_numbers.set_index_vector_dim(1);
+      diag_blocks = Gather(a, gather_indices, dim_numbers,
+                           /*window_bounds=*/{block_size, block_size});
+    }
+
+    // The last block might be smaller than the block size,
+    // so we will need to pad it
+    if (n % block_size != 0) {
+      // Pad with zeros
+      auto last_blocks =
+          SliceInMinorDims(a, {n - n % block_size, n - n % block_size}, {n, n});
+      xla::PaddingConfig config = xla::MakeNoPaddingConfig(ndims);
+      int64 padding = block_size - n % block_size;
+      config.mutable_dimensions(ndims - 1)->set_edge_padding_high(padding);
+      config.mutable_dimensions(ndims - 2)->set_edge_padding_high(padding);
+      last_blocks =
+          Pad(last_blocks, Zero(builder, shape.element_type()), config);
+
+      // Add a singleton dimension
+      // i.e. [..., block_size, block_size] -> [..., 1, block_size, block_size]
+      TF_ASSIGN_OR_RETURN(xla::Shape blocks_shape,
+                          builder->GetShape(last_blocks));
+      auto shape_dims = xla::AsInt64Slice(blocks_shape.dimensions());
+      auto last_blocks_dims = std::vector<int64>(ndims);
+      std::copy(shape_dims.begin(), shape_dims.end(), last_blocks_dims.begin());
+      last_blocks_dims.insert(last_blocks_dims.end() - 2, 1);
+      last_blocks = Reshape(last_blocks, last_blocks_dims);
+
+      // Concatenate with the other blocks if necessary
+      if (n > block_size) {
+        diag_blocks =
+            xla::ConcatInDim(builder, {diag_blocks, last_blocks}, ndims - 2);
+      } else {
+        diag_blocks = last_blocks;
+      }
+    }
+
+    return diag_blocks;
+  });
+}
+
+xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
+                                bool transpose_a, bool conjugate_a) {
+  xla::XlaBuilder* builder = diag_blocks.builder();
+  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
+    // Input is a batch of square lower triangular square matrices. Its shape is
+    // (..., size, size). We resize this to (num_blocks, size, size).
+    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(diag_blocks));
+    int64 block_size = xla::ShapeUtil::GetDimension(shape, -1);
+    int64 num_blocks = xla::ShapeUtil::ElementsIn(shape) /
+                       tensorflow::MathUtil::IPow(block_size, 2);
+    diag_blocks = Reshape(diag_blocks, {num_blocks, block_size, block_size});
+
+    // The input must be triangular because we rely on that when doing
+    // multiplications later on
+    diag_blocks = Triangle(diag_blocks, /*lower=*/lower);
+
+    // Rescale blocks to be unit triangular, but avoid dividing by
+    // zero (which can happen if the last block was padded) otherwise it will
+    // introduce nans which will propagate
+    auto diags = GetMatrixDiagonal(diag_blocks);
+    TF_ASSIGN_OR_RETURN(xla::Shape diags_shape, builder->GetShape(diags));
+    auto one = ScalarLike(diags, 1);
+    auto ones = Broadcast(one, xla::AsInt64Slice(diags_shape.dimensions()));
+    diags = Select(Eq(diags, Zero(builder, shape.element_type())), ones, diags);
+    auto scaled_diag_blocks = Div(diag_blocks, diags, {0, 2});
+
+    // We can now use the fact that for an upper triangular matrix
+    // [[L11, 0], [L21, L22]], given the inverses L11' and L22', we have
+    // L22' = -L22' * L21 * L11'. In our case, L21 is a vector and our blocks
+    // have been rescaled to be unit triangular, so L22 = L22' = 1.
+
+    // Initialize the output matrix with -1s on the diagonal. We use -1 instead
+    // of 1 because we cannot do matrix-vector multiplies with variable shapes
+    // inside of a loop, or do irregularly shaped in-place updates. Hence,
+    // L21 <- -L22 * L21 * L11 cannot be done naively. Instead, we update the
+    // entire row i.e. we calculate
+    // [L21 L22 0] <- -[L21 L22 0] @ diag_blocks([L11', -I, -I])
+    // which means [L21 L22 0] <- [-L21 * L11', L22, 0].
+    auto identity =
+        IdentityMatrix(builder, shape.element_type(), block_size, block_size);
+    auto neg_identity = -identity;
+
+    // The first or last  diagonal element should be set to 1 instead of -1
+    // though, since we never update it
+    auto pos_one = Reshape(One(builder, shape.element_type()), {1, 1});
+    auto start_index = (lower) ? 0 : block_size - 1;
+    auto output_block = DynamicUpdateSlice(
+        neg_identity, pos_one,
+        /*start_indices=*/xla::ConstantR1<int>(builder, 2, start_index));
+
+    // Broadcast diag([1, -1, -1, ...]) to every block
+    xla::XlaOp output = Broadcast(output_block,
+                                  /*broadcast_sizes=*/{num_blocks});
+
+    // Now we construct a loop that performs matrix-vector multiplications
+    // inverting the blocks one row at a time
+    std::vector<xla::Shape> tuple_shapes = {
+        // The loop iteration counter is a scalar, incremented each iteration.
+        xla::ShapeUtil::MakeShape(xla::S32, {}),
+        // The output has the shape of A, with one row updated each iteration.
+        xla::ShapeUtil::MakeShape(shape.element_type(),
+                                  {num_blocks, block_size, block_size}),
+        // The input is a loop invariant.
+        xla::ShapeUtil::MakeShape(shape.element_type(),
+                                  {num_blocks, block_size, block_size})};
+    xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes);
+
+    auto init_i = One(builder, xla::S32);
+    auto init = xla::Tuple(builder, {init_i, output, scaled_diag_blocks});
+
+    // Construct the loop condition function.
+    std::unique_ptr<xla::XlaBuilder> condb =
+        builder->CreateSubBuilder("InvertDiagCond");
+    {
+      auto i = GetTupleElement(
+          Parameter(condb.get(), 0, tuple_shape, "InvertDiagCondTuple"), 0);
+      Lt(i, xla::ConstantR0<int32>(condb.get(), block_size));
+    }
+    TF_ASSIGN_OR_RETURN(auto cond, condb->Build());
+
+    // Construct the loop body function.
+    std::unique_ptr<xla::XlaBuilder> bodyb =
+        builder->CreateSubBuilder("InvertDiagBody");
+    {
+      auto input_tuple =
+          Parameter(bodyb.get(), 0, tuple_shape, "InvertDiagBodyTuple");
+
+      auto i = GetTupleElement(input_tuple, 0);
+      auto body_out = GetTupleElement(input_tuple, 1);
+      auto body_input = GetTupleElement(input_tuple, 2);
+
+      auto zero = xla::ConstantR1<int32>(bodyb.get(), 1, 0);
+      auto j = (lower) ? i : ScalarLike(i, block_size - 1) - i;
+      auto start_indices =
+          xla::ConcatInDim(bodyb.get(), {zero, Reshape(j, {1}), zero}, 0);
+      auto input_row =
+          DynamicSlice(body_input, start_indices,
+                       /*slice_sizes=*/{num_blocks, 1, block_size});
+
+      // We want -L21 L11^{-1}
+      xla::DotDimensionNumbers dnums;
+      dnums.add_lhs_batch_dimensions(0);
+      dnums.add_rhs_batch_dimensions(0);
+      dnums.add_lhs_contracting_dimensions(2);
+      dnums.add_rhs_contracting_dimensions(1);
+      auto update = -DotGeneral(input_row, body_out, dnums);
+
+      body_out = DynamicUpdateSlice(body_out, update, start_indices);
+
+      auto next_i = i + ScalarLike(i, 1);
+      xla::Tuple(bodyb.get(), {next_i, body_out, body_input});
+    }
+    TF_ASSIGN_OR_RETURN(auto body, bodyb->Build());
+
+    // Construct the While loop and return the result,
+    // return while_loop(cond_fun, body_fun, init)[1]
+    auto invert_while = While(cond, body, init);
+    auto inv_diag_blocks = GetTupleElement(invert_while, 1);
+
+    // Undo the scaling
+    inv_diag_blocks = Div(inv_diag_blocks, diags,
+                          /*broadcast_dimensions=*/{0, 1});
+
+    // Reshape back to original batch major dimensions
+    return Reshape(inv_diag_blocks, xla::AsInt64Slice(shape.dimensions()));
+  });
+}
+
+xla::XlaOp SolveWithInvertedDiagonalBlocks(xla::XlaOp a, xla::XlaOp b,
+                                           xla::XlaOp inv_diag_blocks,
+                                           bool left_side, bool lower,
+                                           bool transpose_a, bool conjugate_a) {
+  xla::XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
+    TF_ASSIGN_OR_RETURN(xla::Shape blocks_shape,
+                        builder->GetShape(inv_diag_blocks));
+    TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b));
+    int64 block_size = xla::ShapeUtil::GetDimension(blocks_shape, -1);
+
+    TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
+    int64 ndims = xla::ShapeUtil::Rank(a_shape);
+    int64 n = xla::ShapeUtil::GetDimension(a_shape, -1);
+    int64 num_blocks = n / block_size + (n % block_size != 0);
+    int64 m_dim = (left_side) ? -1 : -2;
+    int64 m = xla::ShapeUtil::GetDimension(b_shape, m_dim);
+
+    // Initialize the solution
+    auto x = ZerosLike(b);
+
+    // This loop is unrolled for performance reasons, but it could be expressed
+    // rolled as well since the matrices are of the same size each iteration
+    for (int i = 0; i < num_blocks; i++) {
+      // High-level intuition: We have B[i] = L[i] @ X. Since L is upper
+      // triangular this means B[i] = L[i, :i + 1] @ X[:i + 1]. We can split
+      // this into two parts: B[i] = L[i, :i] @ X[:i] + L[i, i] @ X[i] which
+      // can be solved for X[i] as X[i] = inv(L[i, i]) @ B[i] - L[i, :i] @ X[:i]
+
+      // Decide whether we go from first block to last or vice versa
+      auto j = (left_side ^ lower ^ transpose_a) ? num_blocks - 1 - i : i;
+
+      // Get the size of the inverse blocks (the last one might be smaller)
+      int64 block = (n % block_size != 0 && j + 1 == num_blocks)
+                        ? n % block_size
+                        : block_size;
+      auto inv_block =
+          MaybeConjugate(Collapse(SliceInMinorDims(inv_diag_blocks, {j, 0, 0},
+                                                   {j + 1, block, block}),
+                                  /*dimensions=*/{ndims - 2, ndims - 1}),
+                         conjugate_a);
+
+      // Get the corresponding row of B
+      int64 k = std::min((j + 1) * block_size, n);
+      std::vector<int64> start = {j * block_size, 0};
+      std::vector<int64> end = {k, m};
+      if (!left_side) {
+        std::swap(start[0], start[1]);
+        std::swap(end[0], end[1]);
+      }
+      auto b_row = SliceInMinorDims(b, start, end);
+
+      xla::XlaOp remainder;
+      if (i == 0) {
+        remainder = b_row;
+      } else {
+        // This matrix multiply involves a lot of multiplying with zero (namely,
+        // X[i * block_size:] = 0), but this is faster than slicing...
+        end = {k, n};
+        if (!left_side) {
+          std::swap(end[0], end[1]);
+        }
+        if (transpose_a) {
+          std::swap(start[0], start[1]);
+          std::swap(end[0], end[1]);
+        }
+        auto a_row =
+            MaybeConjugate(SliceInMinorDims(a, start, end), conjugate_a);
+        if (left_side) {
+          remainder = b_row - BatchDot(a_row, x, transpose_a, false);
+        } else {
+          remainder = b_row - BatchDot(x, a_row, false, transpose_a);
+        }
+      }
+
+      xla::XlaOp x_update;
+      auto zero = Zero(builder, xla::S32);
+      auto start_index =
+          xla::ConstantR0WithType(builder, xla::S32, j * block_size);
+      std::vector<xla::XlaOp> update_starts = {start_index, zero};
+      if (left_side) {
+        x_update = BatchDot(inv_block, remainder, transpose_a, false);
+      } else {
+        x_update = BatchDot(remainder, inv_block, false, transpose_a);
+        std::swap(update_starts[0], update_starts[1]);
+      }
+      x = DynamicUpdateSliceInMinorDims(x, x_update, /*starts=*/update_starts);
+    }
+
+    return x;
+  });
+}
+
 xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
                            bool lower, bool transpose_a, bool conjugate_a,
                            int64 block_size) {
@@ -45,7 +344,7 @@ xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
           xla::ShapeUtil::HumanString(a_shape), " vs. ",
           xla::ShapeUtil::HumanString(b_shape));
     }
-    const int ndims = xla::ShapeUtil::Rank(a_shape);
+    const int64 ndims = xla::ShapeUtil::Rank(a_shape);
     if (ndims < 2) {
       return errors::InvalidArgument(
           "Arguments to TriangularSolve must have rank >= 2: ", ndims);
@@ -85,528 +384,18 @@ xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
           block_size);
     }
 
-    std::map<int, xla::XlaComputation> base_computations;
-    auto get_base_triangular_solve =
-        [&](int k) -> xla::StatusOr<xla::XlaComputation*> {
-      xla::XlaComputation& computation = base_computations[k];
-      if (computation.IsNull()) {
-        std::unique_ptr<xla::XlaBuilder> sub = builder->CreateSubBuilder(
-            tensorflow::strings::StrCat("trsm_base_", k));
-
-        auto a_param = xla::Parameter(
-            sub.get(), 0,
-            xla::ShapeUtil::MakeShape(b_shape.element_type(),
-                                      ConcatVectors(batch_dimensions, {k, k})),
-            "a");
-
-        std::array<int64, 2> b_lastd;
-        if (left_side) {
-          b_lastd = {k, n};
-        } else {
-          b_lastd = {m, k};
-        }
-        auto b_param = xla::Parameter(
-            sub.get(), 1,
-            xla::ShapeUtil::MakeShape(b_shape.element_type(),
-                                      ConcatVectors(batch_dimensions, b_lastd)),
-            "b");
-
-        // We use a left-looking or right-looking subroutine on the block
-        // diagonal in the lower=true cases, while falling back to a recursive
-        // call in others. The left-looking and right-looking subroutines are
-        // written with a While loop and so yields much faster compile times.
-        // Moreover, they can give higher performance on smaller (sub)problems.
-        if (left_side && lower) {
-          TriangularSolveLeftLooking(a_param, b_param, transpose_a,
-                                     conjugate_a);
-        } else if (!left_side && lower) {
-          TriangularSolveRightLooking(a_param, b_param, transpose_a,
-                                      conjugate_a);
-        } else {
-          TriangularSolve(a_param, b_param, left_side, lower, transpose_a,
-                          conjugate_a,
-                          /*block_size=*/1);
-        }
-
-        TF_ASSIGN_OR_RETURN(computation, sub->Build());
-      }
-      return &computation;
-    };
-
-    xla::XlaOp output = xla::ZerosLike(b);
-
-    // Right-looking blocked triangular solve.
-    // For an explanation of the algorithm, see the TRSM discussion in:
-    // Goto, Kazushige, and Robert Van De Geijn. "High-performance
-    // implementation of the level-3 BLAS." ACM Transactions on Mathematical
-    // Software (TOMS) 35.1 (2008): 4.
-
-    // In the code comments below, T = lambda x: np.swapaxes(x, -1, -2) if
-    // conjugate_a is False, or T = lambda x: np.conj(np.swapaxes(x, -1, -2)) if
-    // conjugate_a is True.
-
-    if (!left_side && lower == transpose_a) {
-      // for i in range(0, a.shape[-1], block_size):
-      for (int64 i = 0; i < n; i += block_size) {
-        int64 k = std::min(block_size, n - i);
-
-        // output[..., :, i:i+k] = triangular_solve(
-        //     a[..., i:i+k, i:i+k],
-        //     b[..., :, i:i+k] - np.matmul(output[..., :, :i],
-        //                                  a[..., :i, i:i+k]),
-        //     ..., block_size=1)
-        auto a_slice = SliceInMinorDims(a, {i, i}, {i + k, i + k});
-        auto b_slice = SliceInMinorDims(b, {0, i}, {m, i + k});
-
-        // Note that we multiply with the full output, since this is faster
-        // than slicing, and output[..., :, i:] = 0
-        xla::XlaOp a_prev;
-        if (lower) {
-          a_prev = SliceInMinorDims(a, {i, 0}, {i + k, n});
-        } else {
-          a_prev = SliceInMinorDims(a, {0, i}, {n, i + k});
-        }
-        auto prev_contribution = BatchDot(output, a_prev,
-                                          /*transpose_x=*/false,
-                                          /*transpose_y=*/transpose_a,
-                                          /*conjugate_x=*/false,
-                                          /*conjugate_y=*/conjugate_a);
-        auto to_solve = b_slice - prev_contribution;
-
-        xla::XlaOp update;
-        if (k > 1) {
-          TF_ASSIGN_OR_RETURN(xla::XlaComputation * solve,
-                              get_base_triangular_solve(k));
-          update = xla::Call(builder, *solve, {a_slice, to_solve});
-        } else {
-          auto a_slice_conj = MaybeConjugate(a_slice, conjugate_a);
-          update = to_solve / a_slice_conj;
-        }
-        output = UpdateSliceInMinorDims(output, update, {0, i});
-      }
-
-    } else if (left_side && lower != transpose_a) {
-      // for i in range(0, a.shape[-1], block_size):
-      for (int64 i = 0; i < m; i += block_size) {
-        int64 k = std::min(block_size, m - i);
-
-        // output[..., i:i+k, :] = triangular_solve(
-        //     a[..., i:i+k, i:i+k],
-        //     b[..., i:i+k, :] - np.matmul(a[..., i:i+k, :i],
-        //                                  output[..., :i, :]),
-        //     ..., block_size=1)
-        auto a_slice = SliceInMinorDims(a, {i, i}, {i + k, i + k});
-        auto b_slice = SliceInMinorDims(b, {i, 0}, {i + k, n});
-
-        xla::XlaOp a_prev;
-        if (lower) {
-          a_prev = SliceInMinorDims(a, {i, 0}, {i + k, m});
-        } else {
-          a_prev = SliceInMinorDims(a, {0, i}, {m, i + k});
-        }
-        auto prev_contribution = BatchDot(a_prev, output,
-                                          /*transpose_x=*/transpose_a,
-                                          /*transpose_y=*/false,
-                                          /*conjugate_x=*/conjugate_a,
-                                          /*conjugate_y=*/false);
-        auto to_solve = b_slice - prev_contribution;
-
-        xla::XlaOp update;
-        if (k > 1) {
-          TF_ASSIGN_OR_RETURN(xla::XlaComputation * solve,
-                              get_base_triangular_solve(k));
-          update = xla::Call(builder, *solve, {a_slice, to_solve});
-        } else {
-          auto a_slice_conj = MaybeConjugate(a_slice, conjugate_a);
-          update = to_solve / a_slice_conj;
-        }
-        output = UpdateSliceInMinorDims(output, update, {i, 0});
-      }
-    } else if (!left_side && lower != transpose_a) {
-      // for i in reversed(range(0, a.shape[-1], block_size)):
-      const int64 last_blk_ix =
-          xla::RoundUpToNearest(n, block_size) - block_size;
-      for (int64 i = last_blk_ix; i >= 0; i -= block_size) {
-        int64 k = std::min(block_size, n - i);
-
-        // output[..., :, i:i+k] = triangular_solve(
-        //     a[..., i:i+k, i:i+k],
-        //     b[..., :, i:i+k] - np.matmul(output[..., :, :i],
-        //                                  a[..., :i, i:i+k]),\
-        //     ..., block_size=1)
-        auto a_slice = SliceInMinorDims(a, {i, i}, {i + k, i + k});
-        auto b_slice = SliceInMinorDims(b, {0, i}, {m, i + k});
-
-        xla::XlaOp a_prev;
-        if (lower) {
-          a_prev = SliceInMinorDims(a, {0, i}, {n, i + k});
-        } else {
-          a_prev = SliceInMinorDims(a, {i, 0}, {i + k, n});
-        }
-        auto prev_contribution = BatchDot(output, a_prev,
-                                          /*transpose_x=*/false,
-                                          /*transpose_y=*/transpose_a,
-                                          /*conjugate_x=*/false,
-                                          /*conjugate_y=*/conjugate_a);
-        auto to_solve = b_slice - prev_contribution;
-
-        xla::XlaOp update;
-        if (k > 1) {
-          TF_ASSIGN_OR_RETURN(xla::XlaComputation * solve,
-                              get_base_triangular_solve(k));
-          update = xla::Call(builder, *solve, {a_slice, to_solve});
-        } else {
-          auto a_slice_conj = MaybeConjugate(a_slice, conjugate_a);
-          update = to_solve / a_slice_conj;
-        }
-        output = UpdateSliceInMinorDims(output, update, {0, i});
-      }
-    } else {  // left_side && lower == transpose_a
-      // for i in reversed(range(0, a.shape[-1], block_size)):
-      const int64 last_blk_ix =
-          xla::RoundUpToNearest(m, block_size) - block_size;
-      for (int64 i = last_blk_ix; i >= 0; i -= block_size) {
-        int64 k = std::min(block_size, m - i);
-
-        // output[..., i:i+k, :] = triangular_solve(
-        //     a[..., i:i+k, i:i+k],
-        //     b[..., i:i+k, :] - np.matmul(a[..., i:i+k, :i],
-        //                                  output[..., :i, :]),
-        //     ..., block_size=1)
-        auto a_slice = SliceInMinorDims(a, {i, i}, {i + k, i + k});
-        auto b_slice = SliceInMinorDims(b, {i, 0}, {i + k, n});
-
-        xla::XlaOp a_prev;
-        if (lower) {
-          a_prev = SliceInMinorDims(a, {0, i}, {m, i + k});
-        } else {
-          a_prev = SliceInMinorDims(a, {i, 0}, {i + k, m});
-        }
-        auto prev_contribution = BatchDot(a_prev, output,
-                                          /*transpose_x=*/transpose_a,
-                                          /*transpose_y=*/false,
-                                          /*conjugate_x=*/conjugate_a,
-                                          /*conjugate_y=*/false);
-        auto to_solve = b_slice - prev_contribution;
-
-        xla::XlaOp update;
-        if (k > 1) {
-          TF_ASSIGN_OR_RETURN(xla::XlaComputation * solve,
-                              get_base_triangular_solve(k));
-          update = xla::Call(builder, *solve, {a_slice, to_solve});
-        } else {
-          auto a_slice_conj = MaybeConjugate(a_slice, conjugate_a);
-          update = to_solve / a_slice_conj;
-        }
-        output = UpdateSliceInMinorDims(output, update, {i, 0});
-      }
-    }
-
-    return output;
-  });
-}
-
-xla::XlaOp TriangularSolveLeftLooking(xla::XlaOp a, xla::XlaOp b,
-                                      bool transpose_a, bool conjugate_a) {
-  xla::XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
-    TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b));
-    const int64 m = xla::ShapeUtil::GetDimension(b_shape, -2);
-    const int64 n = xla::ShapeUtil::GetDimension(b_shape, -1);
-    const int64 ndims = xla::ShapeUtil::Rank(a_shape);
-
-    std::vector<int64> batch_dimensions;
-    int64 num_batches = 1;
-    for (int i = 0; i < ndims - 2; ++i) {
-      int64 a_size = a_shape.dimensions(i);
-      batch_dimensions.push_back(a_size);
-      num_batches = num_batches * a_size;
-    }
-
-    // Rescale the input to be unit triangular
-    auto diag = Diagonal(a);
-    xla::XlaOp scaled_a;
-    std::vector<int64> broadcast_dimensions(ndims - 1);
-    std::iota(broadcast_dimensions.begin(), broadcast_dimensions.end(), 0);
-    if (transpose_a) {
-      scaled_a = Div(a, diag, broadcast_dimensions);
-    } else {
-      // Broadcast over the rows
-      broadcast_dimensions[ndims - 2] = ndims - 1;
-      scaled_a = Div(a, diag, broadcast_dimensions);
-    }
-
-    // The main computation is performed in a While loop.
-
-    // Allocate the output and set its first or last row,
-    // output = np.zeros_like(b)
-    // if transpose_a:
-    //   output[..., m-1:, :] = b[..., m-1:, :] / a[..., m-1:, m-1:]
-    // else:
-    //   output[..., :1, :] = b[..., :1, :] / a[..., :1, :1]
-    xla::XlaOp output = xla::ZerosLike(b);
-    {
-      auto i = transpose_a ? m - 1 : 0;
-      auto a_slice = SliceInMinorDims(scaled_a, {i, i}, {i + 1, i + 1});
-      auto b_slice = SliceInMinorDims(b, {i, 0}, {i + 1, n});
-      auto a_slice_conj = MaybeConjugate(a_slice, conjugate_a);
-      auto update = b_slice / a_slice_conj;
-      output = UpdateSliceInMinorDims(output, update, {i, 0});
-    }
-
-    // Construct the initial loop carry tuple,
-    // if transpose_a:
-    //   init = (m-2, output, a, b)
-    // else:
-    //   init = (1, output, a, b)
-    std::vector<xla::Shape> tuple_shapes = {
-        // The loop iteration counter is a scalar, incremented each iteration.
-        xla::ShapeUtil::MakeShape(xla::S32, {}),
-        // The output has the shape of b, with one row updated each iteration.
-        b_shape,
-        // The coefficient matrix a is a loop invariant.
-        a_shape,
-        // The right-hand-side matrix b is a loop invariant.
-        b_shape};
-    xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes);
-    auto init_i = xla::ConstantR0<int32>(builder, transpose_a ? m - 2 : 1);
-    auto init = xla::Tuple(builder, {init_i, output, scaled_a, b});
-
-    // Construct the loop condition function,
-    // def cond_fun(loop_carry):
-    //   i, output, a, b = loop_carry
-    //   return i >= 0 if transpose_a else i < m
-    std::unique_ptr<xla::XlaBuilder> condb =
-        builder->CreateSubBuilder("TriangularSolveLeftLookingWhileCond");
-    {
-      auto i = xla::GetTupleElement(
-          xla::Parameter(condb.get(), 0, tuple_shape,
-                         "TriangularSolveLeftLookingWhileTuple"),
-          0);
-      if (transpose_a) {
-        xla::Ge(i, xla::ConstantR0<int32>(condb.get(), 0));
-      } else {
-        xla::Lt(i, xla::ConstantR0<int32>(condb.get(), m));
-      }
-    }
-    TF_ASSIGN_OR_RETURN(auto cond, condb->Build());
-
-    // Construct the loop body function,
-    // def body_fun(loop_carry):
-    //   i, output, a, b = loop_carry
-    //   if transpose_a:
-    //     a_row = np.swapaxes(a[..., i+1:, i:i+1], -1 -2)
-    //   else:
-    //     a_row = a[..., i:i+1, :i]
-    //   result_row = b[..., i:i+1, :] - np.matmul(a_row, output[..., :, :])
-    //   output[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1]
-    //   if transpose_a:
-    //     return (i - 1, output, a, b)
-    //   else:
-    //     return (i + 1, output, a, b)
-    // We have to do some extra FLOPs propagating zeros in the matrix multiply
-    // because we can't have the size of its arguments depend on the loop
-    // counter.
-    std::unique_ptr<xla::XlaBuilder> bodyb =
-        builder->CreateSubBuilder("TriangularSolveLeftLookingWhileBody");
-    {
-      auto input_tuple = xla::Parameter(bodyb.get(), 0, tuple_shape,
-                                        "TriangularSolveLeftLookingWhileTuple");
-
-      // i, output, a, b = loop_carry
-      auto i = xla::GetTupleElement(input_tuple, 0);
-      auto body_out = xla::GetTupleElement(input_tuple, 1);
-      auto body_a = xla::GetTupleElement(input_tuple, 2);
-      auto body_b = xla::GetTupleElement(input_tuple, 3);
-      auto zero = xla::ConstantR0<int32>(bodyb.get(), 0);
-
-      // We'd like to implement this:
-      //   if transpose_a:
-      //     a_row = T(a[..., i+1:, i:i+1])
-      //     result_row = (b[..., i:i+1, :]
-      //                   - np.matmul(a_row, body_out[..., i+1:, :]))
-      //   else:
-      //     result_row = (b[..., i:i+1, :]
-      //                   - np.matmul(a[..., i:i+1, :i], body_out[..., :i, :]))
-      // But since we can't have intermediate array sizes depend on the loop
-      // counter, we instead exploit the fact that we initialized the output to
-      // all zeros and use that as zero-padding (doing unnecessary FLOPs).
-      xla::XlaOp a_row;
-      if (transpose_a) {
-        a_row = DynamicSliceInMinorDims(body_a, {zero, i}, {m, 1});
-      } else {
-        a_row = DynamicSliceInMinorDims(body_a, {i, zero}, {1, m});
-      }
-      auto b_update = BatchDot(a_row, body_out,
-                               /*transpose_x=*/transpose_a,
-                               /*transpose_y=*/false,
-                               /*conjugate_x=*/conjugate_a,
-                               /*conjugate_y=*/false);
-      auto result_row_slice =
-          DynamicSliceInMinorDims(body_b, {i, zero}, {1, n});
-      auto result_row = result_row_slice - b_update;
-
-      // body_out[..., i:i+1, :] = result_row
-      body_out = DynamicUpdateSliceInMinorDims(body_out, result_row, {i, zero});
-
-      // if transpose_a:
-      //   return (i - 1, body_out, a, b)
-      // else:
-      //   return (i + 1, body_out, a, b)
-      auto next_i = xla::Add(
-          i, xla::ConstantR0<int32>(bodyb.get(), transpose_a ? -1 : 1));
-      xla::Tuple(bodyb.get(), {next_i, body_out, body_a, body_b});
-    }
-    TF_ASSIGN_OR_RETURN(auto body, bodyb->Build());
-
-    // Construct the While loop and return the result,
-    // return while_loop(cond_fun, body_fun, init)[1]
-    auto triangular_solve_left_looking_while = xla::While(cond, body, init);
-    output = xla::GetTupleElement(triangular_solve_left_looking_while, 1);
-    auto scaling = MaybeConjugate(diag, conjugate_a);
-    // Broadcast over the columns
-    broadcast_dimensions[ndims - 2] = ndims - 2;
-    return Div(output, scaling, broadcast_dimensions);
-  });
-}
-
-xla::XlaOp TriangularSolveRightLooking(xla::XlaOp a, xla::XlaOp b,
-                                       bool transpose_a, bool conjugate_a) {
-  xla::XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
-    TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b));
-    const int64 m = xla::ShapeUtil::GetDimension(b_shape, -2);
-    const int64 n = xla::ShapeUtil::GetDimension(b_shape, -1);
-    const int64 ndims = xla::ShapeUtil::Rank(a_shape);
-
-    std::vector<int64> batch_dimensions;
-    int64 num_batches = 1;
-    for (int i = 0; i < ndims - 2; ++i) {
-      int64 a_size = a_shape.dimensions(i);
-      batch_dimensions.push_back(a_size);
-      num_batches = num_batches * a_size;
-    }
+    // We find the diagonal blocks of the coefficient matrix
+    auto diag_blocks = DiagonalBlocks(a, block_size);
 
-    // Rescale the input to be unit triangular
-    auto diag = Diagonal(a);
-    xla::XlaOp scaled_a;
-    std::vector<int64> broadcast_dimensions(ndims - 1);
-    std::iota(broadcast_dimensions.begin(), broadcast_dimensions.end(), 0);
-    if (transpose_a) {
-      // Broadcast over the rows
-      broadcast_dimensions[ndims - 2] = ndims - 1;
-      scaled_a = Div(a, diag, broadcast_dimensions);
-    } else {
-      scaled_a = Div(a, diag, broadcast_dimensions);
-    }
+    // We invert these blocks in parallel using batched matrix-vector products
+    auto inv_diag_blocks =
+        InvertDiagonalBlocks(diag_blocks, lower, transpose_a, conjugate_a);
 
-    // The main computation is performed in a While loop.
-    xla::XlaOp output = xla::ZerosLike(b);
+    // We now find the solution using GEMMs
+    auto x = SolveWithInvertedDiagonalBlocks(a, b, inv_diag_blocks, left_side,
+                                             lower, transpose_a, conjugate_a);
 
-    // Construct the initial loop carry tuple,
-    // if transpose_a:
-    //   init = (0, output, a, b)
-    // else:
-    //   init = (n-1, output, a, b)
-    std::vector<xla::Shape> tuple_shapes = {
-        // The loop iteration counter is a scalar, incremented each iteration.
-        xla::ShapeUtil::MakeShape(xla::S32, {}),
-        // The output has the shape of b, with one row updated each iteration.
-        b_shape,
-        // The coefficient matrix a is a loop invariant.
-        a_shape,
-        // The right-hand-side matrix b is a loop invariant.
-        b_shape};
-    xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes);
-    auto init_i = xla::ConstantR0<int32>(builder, transpose_a ? 0 : n - 1);
-    auto init = xla::Tuple(builder, {init_i, output, scaled_a, b});
-
-    // Construct the loop condition function,
-    // def cond_fun(loop_carry):
-    //   i, output, a, b = loop_carry
-    //   return i < n if transpose_a else i >= 0
-    std::unique_ptr<xla::XlaBuilder> condb =
-        builder->CreateSubBuilder("TriangularSolveRightLookingWhileCond");
-    {
-      auto i = xla::GetTupleElement(
-          xla::Parameter(condb.get(), 0, tuple_shape,
-                         "TriangularSolveRightLookingWhileTuple"),
-          0);
-      if (transpose_a) {
-        xla::Lt(i, xla::ConstantR0<int32>(condb.get(), n));
-      } else {
-        xla::Ge(i, xla::ConstantR0<int32>(condb.get(), 0));
-      }
-    }
-    TF_ASSIGN_OR_RETURN(auto cond, condb->Build());
-
-    // Construct the loop body function,
-    // def body_fun(loop_carry):
-    //   i, output, a, b = loop_carry
-    //   if transpose_a:
-    //     a_row = np.swapaxes(a[..., :, i:i+1], -1, -2)
-    //   else:
-    //     a_row = a[..., :, i:i+1]
-    //   result_row = b[..., :, i:i+1] - np.matmul(output, a_row)
-    //   output[..., :, i:i+1] = result_row / a[..., i:i+1, i:i+1]
-    //   if transpose_a:
-    //     return (i - 1, output, a, b)
-    //   else:
-    //     return (i + 1, output, a, b)
-    // We have to do some extra FLOPs propagating zeros in the matrix multiply
-    // because we can't have the size of its arguments depend on the loop
-    // counter.
-    std::unique_ptr<xla::XlaBuilder> bodyb =
-        builder->CreateSubBuilder("TriangularSolveRightLookingWhileBody");
-    {
-      auto input_tuple = xla::Parameter(
-          bodyb.get(), 0, tuple_shape, "TriangularSolveRightLookingWhileTuple");
-
-      // i, output, a, b = loop_carry
-      auto i = xla::GetTupleElement(input_tuple, 0);
-      auto body_out = xla::GetTupleElement(input_tuple, 1);
-      auto body_a = xla::GetTupleElement(input_tuple, 2);
-      auto body_b = xla::GetTupleElement(input_tuple, 3);
-      auto zero = xla::ConstantR0<int32>(bodyb.get(), 0);
-
-      // result = b - np.matmul(output, a)
-      // result_row = result[..., :, i:i+1]
-      auto body_b_slice = DynamicSliceInMinorDims(body_b, {zero, i}, {m, 1});
-      xla::XlaOp a_slice;
-      if (transpose_a) {
-        a_slice = DynamicSliceInMinorDims(body_a, {i, zero}, {1, n});
-      } else {
-        a_slice = DynamicSliceInMinorDims(body_a, {zero, i}, {n, 1});
-      }
-      auto b_update = body_b_slice - BatchDot(body_out, a_slice,
-                                              /*transpose_x=*/false,
-                                              /*transpose_y=*/transpose_a,
-                                              /*conjugate_x=*/false,
-                                              /*conjugate_y=*/conjugate_a);
-
-      // body_out[..., :, i:i+1] = b_update
-      body_out = DynamicUpdateSliceInMinorDims(body_out, b_update, {zero, i});
-
-      // if transpose_a:
-      //   return (i + 1, body_out, a, b)
-      // else:
-      //   return (i - 1, body_out, a, b)
-      auto next_i = xla::Add(
-          i, xla::ConstantR0<int32>(bodyb.get(), transpose_a ? 1 : -1));
-      xla::Tuple(bodyb.get(), {next_i, body_out, body_a, body_b});
-    }
-    TF_ASSIGN_OR_RETURN(auto body, bodyb->Build());
-
-    // Construct the While loop and return the result,
-    // return while_loop(cond_fun, body_fun, init)[1]
-    auto triangular_solve_left_looking_while = xla::While(cond, body, init);
-    output = xla::GetTupleElement(triangular_solve_left_looking_while, 1);
-    auto scaling = MaybeConjugate(diag, conjugate_a);
-    // Broadcast over the rows
-    broadcast_dimensions[ndims - 2] = ndims - 1;
-    return Div(output, scaling, broadcast_dimensions);
+    return x;
   });
 }
 
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.h b/tensorflow/compiler/tf2xla/lib/triangular_solve.h
index 7eb9238014632dbb325f6ae5db2eaab9732db3bd..555760b7efabddfb25c9135b109a1c48b487415e 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.h
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.h
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
 #define TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 
 namespace tensorflow {
 
@@ -61,12 +61,6 @@ xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
                            bool lower, bool transpose_a, bool conjugate_a,
                            int64 block_size = 128);
 
-xla::XlaOp TriangularSolveLeftLooking(xla::XlaOp a, xla::XlaOp b,
-                                      bool transpose_a, bool conjugate_a);
-
-xla::XlaOp TriangularSolveRightLooking(xla::XlaOp a, xla::XlaOp b,
-                                       bool transpose_a, bool conjugate_a);
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
index f1bff6037bfa436e98a8dc7dbf6293b10b8d736f..aeebf16028d40189203cdfd815f06a339ee72902 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
+++ b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -207,6 +207,28 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) {
                              xla::ErrorSpec(1e-2, 1e-2));
 }
 
+XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotransposeIrregularblock) {
+  xla::XlaBuilder builder(TestName());
+
+  xla::XlaOp a, b;
+  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
+  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
+  TriangularSolve(a, b,
+                  /*left_side=*/true, /*lower=*/true,
+                  /*transpose_a=*/false, /*conjugate_a=*/false,
+                  /*block_size=*/3);
+
+  xla::Array2D<float> expected({
+      {0.5, 1.0, 1.5},
+      {0.41666667, 0.33333333, 0.25},
+      {0.23148148, 0.18518519, 0.13888889},
+      {0.16835017, 0.13468013, 0.1010101},
+  });
+
+  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
+                             xla::ErrorSpec(1e-2, 1e-2));
+}
+
 XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) {
   xla::XlaBuilder builder(TestName());
 
@@ -307,47 +329,5 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) {
                                  xla::ErrorSpec(1e-2, 1e-2));
 }
 
-XLA_TEST_F(TriangularSolveLeftLookingTest, Simple) {
-  xla::XlaBuilder builder(TestName());
-
-  xla::XlaOp a, b;
-  auto a_data = CreateR2Parameter<float>(AValsLower(), 0, "a", &builder, &a);
-  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
-  TriangularSolveLeftLooking(a, b,
-                             /*transpose_a=*/false,
-                             /*conjugate_a=*/false);
-
-  xla::Array2D<float> expected({
-      {0.5, 1.0, 1.5},
-      {0.41666667, 0.33333333, 0.25},
-      {0.23148148, 0.18518519, 0.13888889},
-      {0.16835017, 0.13468013, 0.1010101},
-  });
-
-  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
-}
-
-XLA_TEST_F(TriangularSolveLeftLookingTest, NonzeroUpperTriangle) {
-  xla::XlaBuilder builder(TestName());
-
-  xla::XlaOp a, b;
-  auto a_data = CreateR2Parameter<float>(AValsFull(), 0, "a", &builder, &a);
-  auto b_data = CreateR2Parameter<float>(BValsLeft(), 1, "b", &builder, &b);
-  TriangularSolveLeftLooking(a, b,
-                             /*transpose_a=*/false,
-                             /*conjugate_a=*/false);
-
-  xla::Array2D<float> expected({
-      {0.5, 1.0, 1.5},
-      {0.41666667, 0.33333333, 0.25},
-      {0.23148148, 0.18518519, 0.13888889},
-      {0.16835017, 0.13468013, 0.1010101},
-  });
-
-  ComputeAndCompareR2<float>(&builder, expected, {a_data.get(), b_data.get()},
-                             xla::ErrorSpec(1e-2, 1e-2));
-}
-
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc
index a6f5d346cb5ecb85ff6b2306c2502ba31d74cc64..8b5beba383cda45d36e2ee27ca5e3b3c5988b6b7 100644
--- a/tensorflow/compiler/tf2xla/lib/util.cc
+++ b/tensorflow/compiler/tf2xla/lib/util.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h
index 6cb6c088e9d20af05193f0a3da6c2595966eb495..b4905c952820a45371e090aa98466654e2db9661 100644
--- a/tensorflow/compiler/tf2xla/lib/util.h
+++ b/tensorflow/compiler/tf2xla/lib/util.h
@@ -16,8 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_
 #define TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.cc b/tensorflow/compiler/tf2xla/lib/while_loop.cc
index 574e70ddeeab8a3041cd730ce2717daec4f82ddf..d64394f1401d7ceea004a59c991ef6f4a1c58b41 100644
--- a/tensorflow/compiler/tf2xla/lib/while_loop.cc
+++ b/tensorflow/compiler/tf2xla/lib/while_loop.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 
diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.h b/tensorflow/compiler/tf2xla/lib/while_loop.h
index 5b6684c995889efbb1378c7ac4903548891d090a..9493b1f109be0725f7f733b9f9da664264275a69 100644
--- a/tensorflow/compiler/tf2xla/lib/while_loop.h
+++ b/tensorflow/compiler/tf2xla/lib/while_loop.h
@@ -19,8 +19,8 @@ limitations under the License.
 #include <functional>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc
index ac768b206e2a8d163a4253432a1911152f89ce86..48568c825b7a0f13011d3d6e8e62ec5db026760f 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/graph.pb.h"
diff --git a/tensorflow/compiler/tf2xla/tf2xla.h b/tensorflow/compiler/tf2xla/tf2xla.h
index d02fc56c5b8f58f0e4cfe1779ad34fe3b79324c7..432a12a51622b56ae74a677420da321c58960ee6 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.h
+++ b/tensorflow/compiler/tf2xla/tf2xla.h
@@ -18,7 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
 #include "tensorflow/compiler/xla/client/client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/core/framework/graph.pb.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc
index f0b30dcf4e98faa8ab0801a8b0583744bdc669c7..56f7045a98201ed398244f9e3f5ff23788135b75 100644
--- a/tensorflow/compiler/tf2xla/tf2xla_test.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/tf2xla.pb.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
index fe7ec633eca2504faf6cbb2f5fd7f59780ab7976..e89f4733281194f0263ae8cc4907caa0ad781165 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/sharding_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/device_base.h"
 #include "tensorflow/core/platform/mem.h"
diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.h b/tensorflow/compiler/tf2xla/xla_compilation_device.h
index d0b9e34e162f3412cd6662a2e2bbfe3df213c4c2..a6e78825334fec748be5fee80669649df699d2fb 100644
--- a/tensorflow/compiler/tf2xla/xla_compilation_device.h
+++ b/tensorflow/compiler/tf2xla/xla_compilation_device.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/tf2xla/xla_resource.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/device_base.h"
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index cb47581e36bc302ec2789ffbbd962d9dccc368e5..226c89bcf1e66b5afb43cddb03db39b931ca55a8 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -28,12 +28,14 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/executor.h"
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/graph_optimizer.h"
 #include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/node_builder.h"
@@ -688,12 +690,12 @@ Status ValidateFunctionDef(const FunctionDef* fdef,
 Status ValidateGraph(const Graph* graph,
                      const FunctionLibraryDefinition& flib_def,
                      const DeviceType& device_type, const string& name) {
-  auto maybe_error = [&](const string& op, const Status& s) -> Status {
+  auto maybe_error = [&](const Node* node, const Status& s) -> Status {
     if (!s.ok()) {
       return errors::InvalidArgument(strings::StrCat(
           "Detected unsupported operations when trying to compile graph ", name,
-          " on ", device_type.type_string(), ": ", op, " (", s.error_message(),
-          ")"));
+          " on ", device_type.type_string(), ": ", node->def().op(), " (",
+          s.error_message(), ")", FormatNodeForError(*node)));
     }
     return Status::OK();
   };
@@ -706,15 +708,15 @@ Status ValidateGraph(const Graph* graph,
     Status s;
     if (fdef) {
       s = ValidateFunctionDef(fdef, flib_def);
-      TF_RETURN_IF_ERROR(maybe_error(node->def().op(), s));
+      TF_RETURN_IF_ERROR(maybe_error(node, s));
       continue;
     }
     const OpDef* op_def;
     s = OpRegistry::Global()->LookUpOpDef(node->def().op(), &op_def);
-    TF_RETURN_IF_ERROR(maybe_error(node->def().op(), s));
+    TF_RETURN_IF_ERROR(maybe_error(node, s));
     TF_RETURN_IF_ERROR(ValidateNodeDef(node->def(), *op_def));
     s = FindKernelDef(device_type, node->def(), nullptr, nullptr);
-    TF_RETURN_IF_ERROR(maybe_error(node->def().op(), s));
+    TF_RETURN_IF_ERROR(maybe_error(node, s));
   }
   return Status::OK();
 }
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 079c99797e1f1ec26205e33b3c7c16d3764f15ca..acc64d99d3e7f0be76aada5ac4042787a5f4b0f6 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 2fb93be01d6bf4dca22b74f64c1d6c8b0d7f6fb5..be00ed8813fdf2778d6af81556001ef51538dd34 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -312,7 +312,7 @@ TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) {
       str_util::StrContains(status.error_message(), "depends on a parameter"))
       << status.error_message();
   EXPECT_TRUE(
-      str_util::StrContains(status.error_message(), "[[Node: C = Reshape"))
+      str_util::StrContains(status.error_message(), "[[{{node C}} = Reshape"))
       << status.error_message();
 }
 
@@ -1077,6 +1077,8 @@ TEST_F(XlaCompilerTest, FunctionWithInvalidOp) {
   ASSERT_FALSE(status.ok());
   EXPECT_TRUE(str_util::StrContains(status.error_message(), "InvalidOp"))
       << status.error_message();
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), "{{node fill_fn}}"))
+      << status.error_message();
 }
 
 // Tests a graph which has a node with invalid data type.
@@ -1101,6 +1103,8 @@ TEST_F(XlaCompilerTest, NodeWithInvalidDataType) {
   EXPECT_TRUE(str_util::StrContains(status.error_message(),
                                     "is not in the list of allowed values"))
       << status.error_message();
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), "{{node Shape}}"))
+      << status.error_message();
 }
 
 TEST_F(XlaCompilerTest, SingleOpWithoutInputs) {
@@ -1122,9 +1126,10 @@ TEST_F(XlaCompilerTest, SingleOpWithoutInputs) {
     status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "NoOp",
                                    std::move(graph_copy), args, &result);
     ASSERT_FALSE(status.ok());
-    EXPECT_TRUE(str_util::StrContains(status.error_message(),
-                                      "The following nodes are unreachable "
-                                      "from the source in the graph: NoOp"))
+    EXPECT_TRUE(
+        str_util::StrContains(status.error_message(),
+                              "The following nodes are unreachable "
+                              "from the source in the graph: {{node NoOp}}"))
         << status.error_message();
   }
 
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index 0dea366476954123ec09c020d493061fa2637c2f..b24e3aabbe6ba858a8bfb4dd435726984cc7b0f5 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -25,7 +25,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index 38d8cd653cbbe5b01325d6b478589d88909bac56..3db37afdba71342cfb20af8841a40cb54709ca73 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -22,8 +22,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
index dc98d4fda6ae21411065981a7b7383ef0ad50f44..24c812297a69986d16758f02d2d31350d6744516 100644
--- a/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
+++ b/tensorflow/compiler/tf2xla/xla_gpu_backend.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/legacy_flags/backend_registration_flags.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/framework/kernel_def.pb.h"
@@ -21,16 +20,6 @@ limitations under the License.
 namespace tensorflow {
 
 bool GpuOpFilter(KernelDef* kdef) {
-  // TODO(b/31361304): The GPU backend does not parallelize PRNG ops, leading to
-  // slow code.
-  legacy_flags::BackendRegistrationFlags* flags =
-      legacy_flags::GetBackendRegistrationFlags();
-  VLOG(2) << "flags->tf_enable_prng_ops_gpu: " << flags->tf_enable_prng_ops_gpu;
-  if (!flags->tf_enable_prng_ops_gpu &&
-      (kdef->op() == "RandomStandardNormal" || kdef->op() == "RandomUniform" ||
-       kdef->op() == "RandomUniformInt" || kdef->op() == "TruncatedNormal")) {
-    return false;
-  }
   // TODO(b/26783907): The GPU backend currently does not implement sort.
   if (kdef->op() == "XlaSort" || kdef->op() == "TopKV2") {
     return false;
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index 4d1b3b1a135c493b3c2fa95967d2c4768ecfa63f..8efb3d55c88757b9366bdf9622287bdd0a72e295 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -26,7 +26,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h
index d6ca4ab9346593892917e8375b07a8790dc26e79..e6522157a535fc3e4ec96cb0496b6be2e525c336 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.h
+++ b/tensorflow/compiler/tf2xla/xla_helpers.h
@@ -19,7 +19,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_TF2XLA_XLA_HELPERS_H_
 
 #include "tensorflow/compiler/tf2xla/xla_context.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
index 9e17756b27733e2453ea1688d13e1d718c25cfc8..00ccfb1c7873c85564b1bf4cf582cd31baa17ad5 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_executable.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index e8eafb3819f420bf4fd5be2c3930b3da75be58d6..82028c8b9ca9f65a73f8b50edc0a47c7068aba9a 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -21,7 +21,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index 6203cffd806a94743e2df7adc1e52d0255ab7c06..ac9dfe3369078df7392a4ef04679f7d7beacf8bb 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -17,7 +17,8 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_TF2XLA_XLA_OP_KERNEL_H_
 
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/platform/macros.h"
diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc
index baea8149658ec0849ebb570931ca68518ec5284e..7928fa034725206a752cbfe086d01f15cd235df9 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.cc
+++ b/tensorflow/compiler/tf2xla/xla_resource.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/sharding_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h
index 4de18a77887496d30e3b1407ecd9042e619653af..2438490be13809b9f3571a362900b44cb838e76b 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.h
+++ b/tensorflow/compiler/tf2xla/xla_resource.h
@@ -18,7 +18,7 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.pb.h"
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index f1c383fd9e3fff8a306ba0ddcc3f9ee42c63d66a..fdf13bb18c2567d2994612d15119ae87cbfa9137 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -636,7 +636,7 @@ cc_library(
         ":window_util",
         ":xla_data_proto",
         "//tensorflow/compiler/xla/client:padding",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_evaluator",
         "//tensorflow/compiler/xla/service:shape_inference",
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index 25666cad40e8f73812593635ccd0ef56fcd9b955..ad3fcee05b80181369bfdf3cdcdb5452ec9e7e89 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -64,6 +64,7 @@ cc_library(
     hdrs = ["client.h"],
     deps = [
         ":global_data",
+        ":xla_computation",
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:service_interface",
@@ -73,7 +74,6 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/core:lib",
@@ -100,12 +100,12 @@ cc_library(
     deps = [
         ":client",
         ":executable_build_options",
+        ":xla_computation",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
@@ -114,6 +114,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/service:source_map_util",
+        "//tensorflow/compiler/xla/service:stream_pool",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "@llvm//:support",
@@ -126,11 +127,11 @@ cc_library(
     hdrs = ["compile_only_client.h"],
     deps = [
         ":client",
+        ":xla_computation",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:compile_only_service",
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/core:stream_executor_no_cuda",
@@ -174,3 +175,60 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
     ],
 )
+
+cc_library(
+    name = "xla_computation",
+    srcs = ["xla_computation.cc"],
+    hdrs = ["xla_computation.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_proto",
+    ],
+)
+
+cc_library(
+    name = "xla_builder",
+    srcs = ["xla_builder.cc"],
+    hdrs = ["xla_builder.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":padding",
+        ":sharding_builder",
+        ":xla_computation",
+        "//tensorflow/compiler/xla:execution_options_util",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_proto",
+        "//tensorflow/compiler/xla/service:shape_inference",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "xla_builder_test",
+    srcs = ["xla_builder_test.cc"],
+    deps = [
+        ":xla_builder",
+        ":xla_computation",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/core:test",
+    ],
+)
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index 8e54311bade531309c70fa045b1e95eb4fce844d..d0ce5e8a6afa262d4cffdfe8431aab570ffd28df 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/literal.h"
diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index d751e183dd6af85094000d39f85cd205ce68cafc..be50cebfcc0e3c19002635dbd280b14048aa0c93 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -20,7 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/global_data.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service_interface.h"
diff --git a/tensorflow/compiler/xla/client/compile_only_client.h b/tensorflow/compiler/xla/client/compile_only_client.h
index 332c96503637344d56e363e19db4880c37ca9684..a551edeab0943ec5213c5cb035644c02c3cf54d7 100644
--- a/tensorflow/compiler/xla/client/compile_only_client.h
+++ b/tensorflow/compiler/xla/client/compile_only_client.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_
 
 #include "tensorflow/compiler/xla/client/client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/compile_only_service.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index 77ba474cf6915ebe56eb7f847f59e52d8b73ffdd..789daf4728d105c3fb7b640cae8248c389eb4c2b 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -29,8 +29,8 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/core:lib",
     ],
 )
@@ -45,7 +45,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
     ],
 )
 
@@ -58,7 +58,7 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
@@ -72,7 +72,7 @@ cc_library(
         ":constants",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
     ],
 )
 
@@ -86,7 +86,7 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
@@ -101,7 +101,7 @@ cc_library(
         ":constants",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/core:lib",
     ],
 )
@@ -115,7 +115,7 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
@@ -131,7 +131,7 @@ cc_library(
         ":numeric",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/core:lib",
     ],
 )
@@ -150,8 +150,8 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:global_data",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
     ],
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc
index 978fc40f3492cd7d9d7831c370b287bf45e6d3e0..9225b1acd69c214d6f08a45372a8082ed789c18c 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.cc
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -94,16 +94,18 @@ XlaComputation CreateScalarMinComputation(PrimitiveType type,
       });
 }
 
-XlaComputation CreateScalarAndComputation(XlaBuilder* builder) {
+XlaComputation CreateScalarAndComputation(PrimitiveType type,
+                                          XlaBuilder* builder) {
   return CreateScalarComputation(
-      "and", PRED, builder,
+      "and", type, builder,
       [](XlaBuilder* b, const XlaOp& lhs, const XlaOp& rhs) {
         return And(lhs, rhs);
       });
 }
 
-XlaComputation CreateScalarOrComputation(XlaBuilder* builder) {
-  return CreateScalarComputation("or", PRED, builder,
+XlaComputation CreateScalarOrComputation(PrimitiveType type,
+                                         XlaBuilder* builder) {
+  return CreateScalarComputation("or", type, builder,
                                  [](XlaBuilder* b, const XlaOp& lhs,
                                     const XlaOp& rhs) { return Or(lhs, rhs); });
 }
@@ -112,7 +114,7 @@ XlaOp Any(XlaOp predicates) {
   XlaBuilder* builder = predicates.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     auto f = ConstantR0<bool>(builder, false);
-    XlaComputation logical_or = CreateScalarOrComputation(builder);
+    XlaComputation logical_or = CreateScalarOrComputation(PRED, builder);
     TF_ASSIGN_OR_RETURN(const Shape& predicates_shape,
                         builder->GetShape(predicates));
     std::vector<int64> all_dimensions(ShapeUtil::Rank(predicates_shape));
diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.h b/tensorflow/compiler/xla/client/lib/arithmetic.h
index d0b916e8c8f742406caad0571d6e99224ed81404..632e8cc8bc64fad236a0226c6e93079aadde7050 100644
--- a/tensorflow/compiler/xla/client/lib/arithmetic.h
+++ b/tensorflow/compiler/xla/client/lib/arithmetic.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include <memory>
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -45,10 +45,12 @@ XlaComputation CreateScalarMinComputation(PrimitiveType type,
                                           XlaBuilder* builder);
 
 // Creates a scalar logical AND computation and returns it.
-XlaComputation CreateScalarAndComputation(XlaBuilder* builder);
+XlaComputation CreateScalarAndComputation(PrimitiveType type,
+                                          XlaBuilder* builder);
 
 // Creates a scalar logical OR computation and returns it.
-XlaComputation CreateScalarOrComputation(XlaBuilder* builder);
+XlaComputation CreateScalarOrComputation(PrimitiveType type,
+                                         XlaBuilder* builder);
 
 // Returns whether any predicate in "predicates" is set.
 //
diff --git a/tensorflow/compiler/xla/client/lib/constants.h b/tensorflow/compiler/xla/client/lib/constants.h
index b47f5243f008ecb2045456e4505d1a571fbed745..0c8a9b8cc02ba0c1ebdf6a060d4b99262dceb178 100644
--- a/tensorflow/compiler/xla/client/lib/constants.h
+++ b/tensorflow/compiler/xla/client/lib/constants.h
@@ -18,7 +18,7 @@ limitations under the License.
 
 #include <type_traits>
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
diff --git a/tensorflow/compiler/xla/client/lib/constants_test.cc b/tensorflow/compiler/xla/client/lib/constants_test.cc
index f1e3439862344c01af15ec0571155ca46a579e54..f4320f65c1f76d4d4c384110b39d6606773aaf01 100644
--- a/tensorflow/compiler/xla/client/lib/constants_test.cc
+++ b/tensorflow/compiler/xla/client/lib/constants_test.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc
index a6d606f9449755fd419cab9aaebf34e1de52a414..0221de7672c7b7c02b1f8b9c7ff4f92151e567c6 100644
--- a/tensorflow/compiler/xla/client/lib/math.cc
+++ b/tensorflow/compiler/xla/client/lib/math.cc
@@ -25,11 +25,9 @@ XlaOp Sqrt(XlaOp operand) { return Pow(operand, ScalarLike(operand, 0.5)); }
 
 XlaOp Rsqrt(XlaOp operand) { return Pow(operand, ScalarLike(operand, -0.5)); }
 
-XlaOp Square(XlaOp operand) { return Pow(operand, ScalarLike(operand, 2.0)); }
+XlaOp Square(XlaOp operand) { return operand * operand; }
 
-XlaOp Reciprocal(XlaOp operand) {
-  return Pow(operand, ScalarLike(operand, -1.0));
-}
+XlaOp Reciprocal(XlaOp operand) { return ScalarLike(operand, 1.0) / operand; }
 
 namespace {
 
diff --git a/tensorflow/compiler/xla/client/lib/math.h b/tensorflow/compiler/xla/client/lib/math.h
index d003d529cc316dfde63f76284f98ae698e1d8034..13db2325569cf2e25e3ff1200adf4b2544dc2f73 100644
--- a/tensorflow/compiler/xla/client/lib/math.h
+++ b/tensorflow/compiler/xla/client/lib/math.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATH_H_
 #define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATH_H_
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace xla {
 
diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc
index 1df287d7db2fb5498900d1bff51b621915a6b0af..14c259a7fa2a47642663b65d2785e5bbdc040cfd 100644
--- a/tensorflow/compiler/xla/client/lib/math_test.cc
+++ b/tensorflow/compiler/xla/client/lib/math_test.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
diff --git a/tensorflow/compiler/xla/client/lib/numeric.cc b/tensorflow/compiler/xla/client/lib/numeric.cc
index cdbeb189f4a23110cbcf995694f5dda7c423775d..1c91237ae1574f92cda78c9bddc6f4ac1d68f47c 100644
--- a/tensorflow/compiler/xla/client/lib/numeric.cc
+++ b/tensorflow/compiler/xla/client/lib/numeric.cc
@@ -79,26 +79,59 @@ XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m,
   return ConvertElementType(indicator, type);
 }
 
-XlaOp Diagonal(XlaOp x) {
+XlaOp GetMatrixDiagonal(XlaOp x) {
   XlaBuilder* builder = x.builder();
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
     const int64 n_dims = ShapeUtil::Rank(shape);
     TF_RET_CHECK(n_dims >= 2);
+    const int64 m = shape.dimensions(n_dims - 2);
     const int64 n = shape.dimensions(n_dims - 1);
+    tensorflow::gtl::ArraySlice<int64> major_dims(
+        AsInt64Slice(shape.dimensions()), /*pos=*/0, /*len=*/n_dims - 2);
+    auto a = Iota(builder, U32, n);
+    auto b = Iota(builder, U32, m);
+    auto indicator = Eq(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    auto mask = Broadcast(indicator, major_dims);
+
+    // TPUs don't support S64 add reduction at the moment. But fortunately
+    // OR-reductions work just as well for integers.
+    XlaComputation reducer =
+        primitive_util::IsIntegralType(shape.element_type())
+            ? CreateScalarOrComputation(shape.element_type(), builder)
+            : CreateScalarAddComputation(shape.element_type(), builder);
+
+    return Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0),
+                  reducer, {m >= n ? n_dims - 2 : n_dims - 1});
+  });
+}
+
+XlaOp Triangle(XlaOp x, bool lower) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_RET_CHECK(n_dims >= 2);
     const int64 m = shape.dimensions(n_dims - 2);
+    const int64 n = shape.dimensions(n_dims - 1);
     tensorflow::gtl::ArraySlice<int64> major_dims(
         AsInt64Slice(shape.dimensions()), /*pos=*/0, /*len=*/n_dims - 2);
     auto a = Iota(builder, U32, n);
     auto b = Iota(builder, U32, m);
-    auto indicator = Eq(a, Broadcast(b, {n}), /*broadcast_dimensions=*/{0});
+    xla::XlaOp indicator;
+    if (lower) {
+      indicator = Ge(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    } else {
+      indicator = Le(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    }
     auto mask = Broadcast(indicator, major_dims);
-    XlaComputation add =
-        CreateScalarAddComputation(shape.element_type(), builder);
-    auto diag = Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0),
-                       add, {n_dims - 1});
-    return diag;
+
+    return Select(mask, x, Zeros(builder, shape));
   });
 }
 
+XlaOp UpperTriangle(XlaOp x) { return Triangle(x, false); }
+
+XlaOp LowerTriangle(XlaOp x) { return Triangle(x, true); }
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/numeric.h b/tensorflow/compiler/xla/client/lib/numeric.h
index 3ec084636b85eea3e9d745a794d335ef19e87f51..efd8cdc25724198633e0bf1c48c4e7d9e4b4c9e1 100644
--- a/tensorflow/compiler/xla/client/lib/numeric.h
+++ b/tensorflow/compiler/xla/client/lib/numeric.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_
 #define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -29,8 +29,19 @@ XlaOp Iota(XlaBuilder* builder, PrimitiveType type, int64 size);
 // else.
 XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, int64 n);
 
-// Get the diagonals of the last two dimensions.
-XlaOp Diagonal(XlaOp x);
+// Get the diagonals of the last two dimensions. If 'x' has shape
+// [..., M, N], then the output has shape [..., min(M, N)], containing the
+// diagonal elements (i.e., with indices [..., i, i]).
+XlaOp GetMatrixDiagonal(XlaOp x);
+
+// Get the upper or lower triangle part of the last two dimensions
+XlaOp Triangle(XlaOp x, bool lower);
+
+// Get the upper triangle part of the last two dimensions
+XlaOp UpperTriangle(XlaOp x);
+
+// Get the lower triangle part of the last two dimensions
+XlaOp LowerTriangle(XlaOp x);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/client/lib/numeric_test.cc b/tensorflow/compiler/xla/client/lib/numeric_test.cc
index bc8a73e9d793ef8f65c321759e03b0de75edd500..8a96ec68d2dca8485215258b1f6731b934e6f2a8 100644
--- a/tensorflow/compiler/xla/client/lib/numeric_test.cc
+++ b/tensorflow/compiler/xla/client/lib/numeric_test.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -24,8 +24,15 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using NumericTest = ClientLibraryTestBase;
+class NumericTest : public ClientLibraryTestBase {
+ protected:
+  template <typename T>
+  void TestMatrixDiagonal();
+};
 
+// TODO(b/64798317): Delete this test case once xla::IotaGen is converted to
+// xla::Iota. This test is already implemented for xla::IotaGen in
+// xla/tests/iota_test.cc.
 XLA_TEST_F(NumericTest, Iota) {
   XlaBuilder builder(TestName());
   Iota(&builder, S32, 10);
@@ -33,5 +40,39 @@ XLA_TEST_F(NumericTest, Iota) {
   ComputeAndCompareR1<int32>(&builder, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {});
 }
 
+XLA_TEST_F(NumericTest, Triangle) {
+  XlaBuilder builder(TestName());
+  Array3D<int32> input(2, 3, 4);
+  input.FillIota(0);
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<int32>(input, 0, "a", &builder, &a);
+  LowerTriangle(a);
+  Array3D<int32> expected({{{0, 0, 0, 0}, {4, 5, 0, 0}, {8, 9, 10, 0}},
+                           {{12, 0, 0, 0}, {16, 17, 0, 0}, {20, 21, 22, 0}}});
+
+  ComputeAndCompareR3<int32>(&builder, expected, {a_data.get()});
+}
+
+template <typename T>
+void NumericTest::TestMatrixDiagonal() {
+  XlaBuilder builder("GetMatrixDiagonal");
+  Array3D<T> input(2, 3, 4);
+  input.FillIota(0);
+
+  XlaOp a;
+  auto a_data = CreateR3Parameter<T>(input, 0, "a", &builder, &a);
+  GetMatrixDiagonal(a);
+  Array2D<T> expected({{0, 5, 10}, {12, 17, 22}});
+
+  ComputeAndCompareR2<T>(&builder, expected, {a_data.get()});
+}
+
+XLA_TEST_F(NumericTest, GetMatrixDiagonal_S32) { TestMatrixDiagonal<int32>(); }
+
+XLA_TEST_F(NumericTest, GetMatrixDiagonal_S64) { TestMatrixDiagonal<int64>(); }
+
+XLA_TEST_F(NumericTest, GetMatrixDiagonal_F32) { TestMatrixDiagonal<float>(); }
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc
index 299a6ac2b630e94567becc3ec139b8c24eab396a..3a744148fba9957c10c825c00d500960f134396c 100644
--- a/tensorflow/compiler/xla/client/lib/prng.cc
+++ b/tensorflow/compiler/xla/client/lib/prng.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/lib/numeric.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/casts.h"
 
diff --git a/tensorflow/compiler/xla/client/lib/prng.h b/tensorflow/compiler/xla/client/lib/prng.h
index ac86390239668eeff1ad9eed0f6c82e10d5db004..ad000b1fa1d0655c8fccc0bb33379f2499b77f26 100644
--- a/tensorflow/compiler/xla/client/lib/prng.h
+++ b/tensorflow/compiler/xla/client/lib/prng.h
@@ -18,7 +18,7 @@ limitations under the License.
 
 #include <array>
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index 534c5098683e8984fe013225e7de2fc48b1bbc1f..b1a776b8b84eb0954e0d874d1b707e46c92f6389 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/lib/testing.h"
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/xla/client/lib/testing.h b/tensorflow/compiler/xla/client/lib/testing.h
index dc613099e2b42a60d0c11a654ab5cd41f8bd4f6f..03695ce2a339735e3e49522f4fe1bbf2d83a3834 100644
--- a/tensorflow/compiler/xla/client/lib/testing.h
+++ b/tensorflow/compiler/xla/client/lib/testing.h
@@ -21,7 +21,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 5f9710914bd0ceff55f5b0a2db05e553ce8bd637..e7250e11d5e59bb01026d5cf304901d17fd2ba42 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -18,10 +18,12 @@ limitations under the License.
 #include <utility>
 
 #include "llvm/ADT/Triple.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/service_executable_run_options.h"
 #include "tensorflow/compiler/xla/service/source_map_util.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 
 using xla::source_map_util::InvalidParameterArgument;
@@ -29,8 +31,8 @@ using xla::source_map_util::InvalidParameterArgument;
 namespace xla {
 
 namespace {
-StatusOr<Backend::StreamPtr> BorrowStreamForDevice(int device_ordinal,
-                                                   Backend* backend) {
+StatusOr<StreamPool::Ptr> BorrowStreamForDevice(int device_ordinal,
+                                                Backend* backend) {
   if (device_ordinal < 0) {
     device_ordinal = backend->default_device_ordinal();
   }
@@ -141,7 +143,7 @@ StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
   TF_RETURN_IF_ERROR(
       ValidateExecutionOptions(arguments, run_options, *backend_));
 
-  Backend::StreamPtr stream;
+  StreamPool::Ptr stream;
   if (run_options.stream() == nullptr) {
     // NB!  The lifetime of `stream` needs to match the lifetime of
     // `actual_options` (otherwise we will end up using a returned stream in
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 4d9e0d7cd9d6ddebead1e12b23e94b529038039b..ae23809261757c637ab4aec036750c371ac60cdc 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -20,7 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
similarity index 99%
rename from tensorflow/compiler/xla/client/xla_client/xla_builder.cc
rename to tensorflow/compiler/xla/client/xla_builder.cc
index ced26fc2eda4b5744cb990fe109cfabf87ee5020..53be5a79c23438e103e353b8c5fc0e2446ad78c0 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 #include <functional>
 #include <numeric>
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/compiler/xla/client/sharding_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
@@ -2873,4 +2874,11 @@ XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
                                           grad_output, epsilon, feature_index);
 }
 
+XlaOp IotaGen(XlaBuilder* builder, PrimitiveType type, int64 size) {
+  HloInstructionProto instr;
+  *instr.mutable_shape() = ShapeUtil::MakeShape(type, {size});
+  return builder->ReportErrorOrReturn(
+      builder->AddInstruction(std::move(instr), HloOpcode::kIota));
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae331407d6cbb08f8bfc25baabbedd1ba897231f
--- /dev/null
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -0,0 +1,2241 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_BUILDER_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_BUILDER_H_
+
+#include <map>
+#include <string>
+#include <type_traits>
+#include <utility>
+
+#include "tensorflow/compiler/xla/client/padding.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/stacktrace.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+class XlaBuilder;
+
+// This represents an instruction that has been enqueued using the XlaBuilder.
+// This is used to pass to subsequent computations that depends upon the
+// instruction as an operand.
+class XlaOp {
+ public:
+  XlaOp() : handle_(-1), builder_(nullptr) {
+    static_assert(std::is_trivially_destructible<XlaOp>::value,
+                  "XlaOp should be trivially destructible");
+  }
+  ~XlaOp() = default;
+
+  // Precondition: !IsUninitialized().
+  //
+  // It's very common to do foo.builder()->bar().  Without this precondition, if
+  // foo.builder() is null, the call to bar will segfault at some point possibly
+  // deep in the callstack when we finally dereference `this`.  The precondition
+  // lets us avoid this tricky-to-debug problem.
+  XlaBuilder* builder() const {
+    CHECK(builder_ != nullptr);
+    return builder_;
+  }
+
+  // Returns true if the XlaOp represents valid, non-erroneous value.
+  bool valid() const { return handle_ >= 0; }
+
+  // Returns true if the XlaOp was created by the XlaOp() constructor and
+  // not returned by a builder.
+  bool IsUninitialized() const { return builder_ == nullptr; }
+
+  bool IsIdenticalTo(const XlaOp& rhs) const {
+    return handle_ == rhs.handle_ && builder_ == rhs.builder_;
+  }
+
+  friend std::ostream& operator<<(std::ostream& out, const XlaOp& op) {
+    out << op.handle();
+    return out;
+  }
+
+ private:
+  explicit XlaOp(XlaBuilder* builder) : handle_(-1), builder_(builder) {}
+  XlaOp(int64 handle, XlaBuilder* builder)
+      : handle_(handle), builder_(builder) {}
+
+  int64 handle() const { return handle_; }
+
+  friend class XlaBuilder;
+
+  // < 0 means "invalid handle".
+  int64 handle_;
+
+  // Not owned. Non-null for any handle returned by XlaBuilder, even if the
+  // handle is invalid.
+  XlaBuilder* builder_;
+};
+
+// Arithmetic operator overloads for the XlaOp type.
+XlaOp operator-(const XlaOp& x);
+XlaOp operator+(const XlaOp& x, const XlaOp& y);
+XlaOp operator-(const XlaOp& x, const XlaOp& y);
+XlaOp operator*(const XlaOp& x, const XlaOp& y);
+XlaOp operator/(const XlaOp& x, const XlaOp& y);
+XlaOp operator%(const XlaOp& x, const XlaOp& y);
+
+// Bitwise operator overloads for the XlaOp type.
+XlaOp operator~(const XlaOp& x);
+XlaOp operator&(const XlaOp& x, const XlaOp& y);
+XlaOp operator|(const XlaOp& x, const XlaOp& y);
+XlaOp operator^(const XlaOp& x, const XlaOp& y);
+XlaOp operator<<(const XlaOp& x, const XlaOp& y);
+// Performs a right arithmetic shift if 'x' is a signed type, otherwise performs
+// a right logical shift.
+XlaOp operator>>(const XlaOp& x, const XlaOp& y);
+
+// We don't overload the relational operators (==, !=, <, <=, >, >=) because the
+// semantics might be surprising since their result types are usually 'bool'.
+// Further programmers may expect == to be a structural equality.
+// We also choose not to overload any of the mutating operators (e.g., +=, -=)
+// because the semantics might be misleading — XLA computations are immutable.
+
+// A convenient interface for building up computations.
+//
+// Thread-compatible.
+class XlaBuilder {
+ public:
+  // computation_name: name to use for the built computation.
+  XlaBuilder(const string& computation_name);
+
+  XlaBuilder(const XlaBuilder&) = delete;
+  XlaBuilder& operator=(const XlaBuilder&) = delete;
+
+  ~XlaBuilder();
+
+  // Returns the computation name.
+  const string& name() const { return name_; }
+
+  // Sets OpMetadata that will be added to all instructions until cleared.
+  //
+  // OpMetadata is often applied to a series of XLA HLO instructions. As a
+  // result, OpMetadata is set on the Computation Builder. All subsequent
+  // instructions generated via this Computation Builder will have the same
+  // OpMetadata attached until a call to ClearOpMetadata.
+  void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; }
+
+  // Clears the HloMetadata state.
+  void ClearOpMetadata() { metadata_.Clear(); }
+
+  // Sets an OpSharding that will be attached to all instructions until cleared.
+  void SetSharding(const OpSharding& sharding) { sharding_ = sharding; }
+
+  // Clears the sharding. Ops will be sharded according to the default placement
+  // policy.
+  void ClearSharding() { sharding_ = tensorflow::gtl::nullopt; }
+
+  // Returns the OpSharding that will be attached to all instructions.
+  const tensorflow::gtl::optional<OpSharding>& sharding() const {
+    return sharding_;
+  }
+
+  // Sets the builder to a mode where it will die immediately when an error is
+  // encountered, rather than producing it in a deferred fashion when Build() is
+  // called (which is the default).
+  void set_die_immediately_on_error(bool enabled) {
+    die_immediately_on_error_ = enabled;
+  }
+
+  // Default dimension numbers used for a 2D convolution.
+  static constexpr int64 kConvBatchDimension = 0;
+  static constexpr int64 kConvFeatureDimension = 1;
+  static constexpr int64 kConvFirstSpatialDimension = 2;
+  static constexpr int64 kConvSecondSpatialDimension = 3;
+  static constexpr int64 kConvKernelOutputDimension = 0;
+  static constexpr int64 kConvKernelInputDimension = 1;
+  static constexpr int64 kConvKernelFirstSpatialDimension = 2;
+  static constexpr int64 kConvKernelSecondSpatialDimension = 3;
+
+  // Creates a default ConvolutionDimensionNumbers. For a 2D convolution, for
+  // the input operand {batch, feature, height, width} = {0, 1, 2, 3} and for
+  // the kernel operand
+  // {output_feature, input_feature, height, width} = {0, 1, 2, 3}.
+  static ConvolutionDimensionNumbers CreateDefaultConvDimensionNumbers(
+      int num_spatial_dims = 2);
+
+  // Returns an error if the convolution dimension numbers have conflicts.
+  static Status Validate(const ConvolutionDimensionNumbers& dnum);
+
+  // Returns a new XlaBuilder whose resultant Computation is used only by this
+  // XlaBuilder. The sub-XlaBuilder has the same die_immediately_on_error
+  // behavior as the parent.
+  std::unique_ptr<XlaBuilder> CreateSubBuilder(const string& computation_name);
+
+  // Builds the computation with the requested operations, or returns a non-ok
+  // status. Note that all ops that have been enqueued will be moved to the
+  // computation being returned.
+  StatusOr<XlaComputation> Build();
+
+  // Builds the computation with the requested operations, or notes an error in
+  // the parent XlaBuilder and returns an empty computation if building failed.
+  // This function is intended to be used where the returned XlaComputation is
+  // only used by the parent XlaBuilder and hence further operation on the
+  // returned XlaComputation will simply be error'ed out if an error occurred
+  // while building this computation. If the built computation is to be used by
+  // a XlaBuilder other than the parent XlaBuilder then Build() should be used
+  // instead.
+  XlaComputation BuildAndNoteError();
+
+  // Returns a subgraph that roots on the given root. If the root is not a
+  // compile-time constant (see `IsConstant`), returns an error.
+  //
+  // This will copy the needed ops/computations to the subgraph.
+  StatusOr<XlaComputation> BuildConstantSubGraph(const XlaOp& root_op) const;
+
+  // Returns the first error that was encountered while building the
+  // computation. When an error is encountered, by default we return a vacuous
+  // XlaOp and inform the user of the error that occurred while
+  // building the computation when they make a final call to Build().
+  //
+  // See also set_die_immediately_on_error().
+  Status first_error() const { return first_error_; }
+
+  // Returns the shape of the given op.
+  StatusOr<Shape> GetShape(const XlaOp& op) const;
+
+  // Returns the (inferred) result for the current computation's shape.
+  StatusOr<ProgramShape> GetProgramShape() const;
+
+  // Reports an error to the builder, by
+  // * storing it internally and capturing a backtrace if it's the first error
+  //   (this deferred value will be produced on the call to
+  //    Build()/GetShape()/...)
+  // * dying if die_immediately_on_error_ is true.
+  // Returns an XlaOp with an invalid handle but a valid builder. This value can
+  // be returned in place of a value in APIs that return an XlaOp.
+  XlaOp ReportError(const Status& error);
+
+  // A helper function that converts a StatusOr<XlaOp> into an XlaOp.
+  // If the Status was an error, reports the error to builder and returns an
+  // invalid XlaOp handle.
+  XlaOp ReportErrorOrReturn(const StatusOr<XlaOp>& op);
+
+  // A helper function that runs a function that returns a StatusOr<XlaOp> and
+  // returns an XlaOp.
+  XlaOp ReportErrorOrReturn(const std::function<StatusOr<XlaOp>()>& op_creator);
+
+  // Returns true if 'operand' is a compile-time constant. A compile-time
+  // constant does not depend on any parameters, or on stateful operators such
+  // as `RngNormal` or `Infeed`.
+  //
+  // This tests whether a computation is a compile-time constant without
+  // evaluating the computation.
+  StatusOr<bool> IsConstant(const XlaOp& operand) const;
+
+ private:
+  // Enqueues a "retrieve parameter value" instruction for a parameter that was
+  // passed to the computation.
+  XlaOp Parameter(int64 parameter_number, const Shape& shape,
+                  const string& name);
+
+  // Enqueues a constant with the value of the given literal onto the
+  // computation.
+  XlaOp ConstantLiteral(const LiteralSlice& literal);
+
+  // Enqueues a constant onto the computation. Methods are templated on the
+  // native host type (NativeT) which corresponds to a specific XLA
+  // PrimitiveType as given in the following table:
+  //
+  //  Native Type   PrimitiveType
+  // -----------------------------
+  //   bool           PRED
+  //   int32          S32
+  //   int64          S64
+  //   uint32         U32
+  //   uint64         U64
+  //   float          F32
+  //   double         F64
+  //
+  // Note: not all primitive types defined in xla_data.proto have a
+  // corresponding native type yet.
+  template <typename NativeT>
+  XlaOp ConstantR0(NativeT value);
+  template <typename NativeT>
+  XlaOp ConstantR1(tensorflow::gtl::ArraySlice<NativeT> values);
+  XlaOp ConstantR1(const tensorflow::core::Bitmap& values);
+  template <typename NativeT>
+  XlaOp ConstantR2(
+      std::initializer_list<std::initializer_list<NativeT>> values);
+  template <typename NativeT>
+  XlaOp ConstantFromArrayWithLayout(const Array<NativeT>& values,
+                                    const Layout& layout);
+  template <typename NativeT>
+  XlaOp ConstantFromArray(const Array<NativeT>& values);
+  template <typename NativeT>
+  XlaOp ConstantR2FromArray2DWithLayout(const Array2D<NativeT>& values,
+                                        const Layout& layout);
+  template <typename NativeT>
+  XlaOp ConstantR2FromArray2D(const Array2D<NativeT>& values);
+  template <typename NativeT>
+  XlaOp ConstantR3FromArray3DWithLayout(const Array3D<NativeT>& values,
+                                        const Layout& layout);
+  template <typename NativeT>
+  XlaOp ConstantR3FromArray3D(const Array3D<NativeT>& values);
+  template <typename NativeT>
+  XlaOp ConstantR4FromArray4DWithLayout(const Array4D<NativeT>& values,
+                                        const Layout& layout);
+  template <typename NativeT>
+  XlaOp ConstantR4FromArray4D(const Array4D<NativeT>& values);
+
+  // Enqueues a rank one constant (vector) onto the computation. The vector has
+  // size 'length' and every element has the value 'value'.
+  template <typename NativeT>
+  XlaOp ConstantR1(int64 length, NativeT value);
+
+  // Adds dimensions to an array by duplicating the data in the array.
+  //
+  // The new dimensions are inserted on the left, i.e. if
+  // broadcast_sizes has values {a0, ..., aN} and the operand shape
+  // has dimensions {b0, ..., bM} then the shape of the output has
+  // dimensions {a0, ..., aN, b0, ..., bM}.
+  //
+  // The new dimensions index into copies of the operand, i.e.
+  //
+  //   output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
+  XlaOp Broadcast(const XlaOp& operand,
+                  tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
+
+  // Performs in-dimension-style broadcast.
+  //
+  // Operand specifies the input to be broadcast. "shape" is expected output
+  // shape. "broadcast_dimensions" are the dimensions to be broadcasting into.
+  // Dimension numbers in broadcast_dimensions map to individual dimensions
+  // of the operand, and specify what dimension of the output shape they
+  // should be broadcast.
+  // e.g.
+  // Say operand = [1, 2], i.e., a 1D tensor with 2 elements.
+  // and dimension of shape is [2,2].
+  // Specifying {1} as brodcast_dimension will generate output
+  // [1 , 2]
+  // [1 , 2]
+  // On the other hand, specifying {0} as broadcast_dimension
+  // will generate output
+  // [1 , 1]
+  // [2 , 2]
+  XlaOp BroadcastInDim(
+      const XlaOp& operand, const Shape& shape,
+      const tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+
+  // Enqueues a pad operation onto the computation that pads the given value on
+  // the edges as well as between the elements of the input. padding_config
+  // specifies the padding amount for each dimension.
+  XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
+            const PaddingConfig& padding_config);
+
+  // Enqueues an operation onto the computation that flattens the operand based
+  // on the dimension order (major/slowest-varying to minor/fastest-varying)
+  // given, followed by reshaping it into the shape with the given dimension
+  // sizes (also major to minor). Conceptually, this is a limited form of
+  // "shape casting".
+  XlaOp Reshape(const XlaOp& operand,
+                tensorflow::gtl::ArraySlice<int64> dimensions,
+                tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+  // Enqueues an operation onto the computation that collapses the operand, from
+  // first to last dimension (C order), then reshapes it to the given dimension
+  // sizes. Conceptually, this is a limited form of "shape casting".
+  XlaOp Reshape(const XlaOp& operand,
+                tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+  // Wrapper for Reshape.
+  // Enqueues an operation to collapse the provided dimensions; e.g. an
+  // operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to
+  // {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must
+  // be a consecutive, in-order subsequence of the operand dimensions.
+  //
+  // Note that collapsing a single dimension does nothing:
+  //
+  //    {256} collapsing {0} => {256}
+  //    {1} collapsing {0} => {1}
+  //
+  // Collapsing multiple dimensions produces a single result dimension:
+  //
+  //    {256, 2} collapsing {0,1} => {512}
+  //    {256, 2, 3} collapsing {0,1} => {512, 3}
+  //
+  // This could potentially cause data to be moved -- it provides a more
+  // structured form of reshaping than an arbitrary Reshape operation.
+  XlaOp Collapse(const XlaOp& operand,
+                 tensorflow::gtl::ArraySlice<int64> dimensions);
+
+  // Enqueues a slice operation onto the computation that slices the operand
+  // from the start indices to the limit indices; e.g.
+  //
+  //        x
+  //   [ 0 1 2 3 ]
+  // y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ]
+  //   [ 8 9 a b ]
+  //
+  // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
+  // range notation.
+  // The strides parameter determines the stride over the slice
+  XlaOp Slice(const XlaOp& operand,
+              tensorflow::gtl::ArraySlice<int64> start_indices,
+              tensorflow::gtl::ArraySlice<int64> limit_indices,
+              tensorflow::gtl::ArraySlice<int64> strides);
+
+  // Enqueues a slice operation in a given dimension, taking all other
+  // dimensions as they are; e.g. if dimno is 1 from start_index 2 to
+  // limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand
+  // for:
+  //
+  //  array[:, 2:4:1, :]
+  XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
+                   int64 stride, int64 dimno);
+
+  // Enqueues a slice operation onto the computation that slices the 'operand'
+  // from dynamic start indices which are passed in 'start_indices'.
+  // The size of the slice in each dimension is passed in 'slice_sizes',
+  // which specify the end point of exclusive slice intervals in each
+  // dimension [start, start + size).
+  // The shape of 'start_indices' must be rank == 1, with dimension size
+  // equal to the rank of the 'operand'.
+  // Slice index calculations are computed modulo input dimension sizes to
+  // prevent dynamic start indices from generating out-of-bound array accesses.
+  XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
+                     tensorflow::gtl::ArraySlice<int64> slice_sizes);
+
+  // Enqueues a dynamic update slice operation onto the computation, which
+  // updates a slice of 'operand' with 'update' at dynamic 'start_indices'.
+  // The shape of 'update' determines the shape of the slice of 'operand'
+  // which is updated.
+  // The indices specified in 'start_indices' specify the offset of the slice
+  // of 'operand' which is updated.
+  //
+  //               update = {10, 11} // calculated at runtime.
+  //   [1 2 3]     start  = {1, 1}   // calculated at runtime.  [1 2  3 ]
+  //   [4 5 6]  => DynamicUpdateslice(data, update, start)   => [4 10 11]
+  //   [7 8 9]                                                  [7 8  9 ]
+  //
+  // The shape of 'start_indices' must be rank == 1, with dimension size
+  // equal to the rank of the 'operand'.
+  // Slice index calculations are computed modulo update dimension sizes to
+  // prevent dynamic start indices from generating out-of-bound array accesses.
+  XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                           const XlaOp& start_indices);
+
+  // Enqueues a concatenate instruction onto the computation. 'operands' must
+  // have >= 1 entry.
+  XlaOp ConcatInDim(tensorflow::gtl::ArraySlice<XlaOp> operands,
+                    int64 dimension);
+
+  // Enqueue a tracing operation onto the computation; the computation will emit
+  // a logging message with the operand.
+  void Trace(const string& tag, const XlaOp& operand);
+
+  // Enqueues a conditional-move-like select operation onto the computation;
+  // predicated on pred, selects between on_true and on_false.
+  XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false);
+
+  // Enqueues a tuple-creation instruction onto the computation.
+  XlaOp Tuple(tensorflow::gtl::ArraySlice<XlaOp> elements);
+
+  // Enqueues a tuple-element-get instruction onto the computation.
+  XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
+
+  // Enqueues an equal-to comparison instruction onto the computation.
+  XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a not-equal comparison instruction onto the computation.
+  XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a greater-or-equal comparison instruction onto the computation.
+  XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a greater-than comparison instruction onto the computation.
+  XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a less-than comparison instruction onto the computation.
+  XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a less-or-equal comparison instruction onto the computation.
+  XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a dot instruction onto the computation.
+  XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
+
+  // Enqueues a general dot instruction onto the computation.
+  XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
+                   const DotDimensionNumbers& dimension_numbers);
+
+  // Enqueues a convolution instruction onto the computation, which uses the
+  // default convolution dimension numbers.
+  XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
+             tensorflow::gtl::ArraySlice<int64> window_strides,
+             Padding padding);
+
+  // Enqueues a convolution instruction onto the computation, with the caller
+  // provided padding configuration in the format returned by MakePadding().
+  XlaOp ConvWithGeneralPadding(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+
+  // Enqueues a convolution instruction onto the computation, with the caller
+  // provided dimension numbers configuration.
+  XlaOp ConvWithGeneralDimensions(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+      const ConvolutionDimensionNumbers& dimension_numbers);
+
+  // Enqueues a convolution instruction onto the computation, with the caller
+  // provided padding configuration as well as the dimension numbers.
+  XlaOp ConvGeneral(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      const ConvolutionDimensionNumbers& dimension_numbers);
+
+  // Enqueues a convolution instruction onto the computation, with the caller
+  // provided padding configuration, dilation factors and dimension numbers.
+  XlaOp ConvGeneralDilated(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+      tensorflow::gtl::ArraySlice<int64> rhs_dilation,
+      const ConvolutionDimensionNumbers& dimension_numbers);
+
+  // Enqueues an FFT instruction onto the computation, of the given type and
+  // with the given FFT length.
+  XlaOp Fft(const XlaOp& operand, FftType fft_type,
+            tensorflow::gtl::ArraySlice<int64> fft_length);
+
+  // Enqueues an infeed instruction onto the computation, which writes data of
+  // the given shape to the infeed buffer of the device.
+  XlaOp Infeed(const Shape& shape, const string& config = "");
+  XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
+                        const string& config = "");
+
+  // Enqueues an outfeed instruction onto the computation. This instruction
+  // generates outgoing data transfers for the given data.
+  //
+  // shape_with_layout communicates the laid out shape that we want to outfeed
+  // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error
+  // will occur.
+  void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
+               const string& outfeed_config);
+  XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
+                         const Shape& shape_with_layout,
+                         const string& outfeed_config);
+
+  // Enqueues a call instruction onto the computation.
+  XlaOp Call(const XlaComputation& computation,
+             tensorflow::gtl::ArraySlice<XlaOp> operands);
+
+  // Enqueues a custom call instruction onto the computation.
+  // During code generation, a call instruction is emitted which targets a
+  // symbol with the name |call_target_name|.  The |operands| are passed to the
+  // call instruction.  |shape| is the resultant shape.
+  XlaOp CustomCall(const string& call_target_name,
+                   tensorflow::gtl::ArraySlice<XlaOp> operands,
+                   const Shape& shape);
+
+  // Enqueues a pseudo-op to represent host-side computation data-dependencies.
+  // During code generation, host send and receive operations will be generated
+  // to transfer |operands| to the host and a single result of |shape| back to
+  // the device.  Host send/recv operations are emitted using |channel_name|.
+  // Dataflow dependencies and the |cost_estimate_ns| field may be used in HLO
+  // instruction scheduling.
+  XlaOp HostCompute(tensorflow::gtl::ArraySlice<XlaOp> operands,
+                    const string& channel_name, int64 cost_estimate_ns,
+                    const Shape& shape);
+
+  // The following methods enqueue element-wise binary arithmetic operations
+  // onto the computation. The shapes of the operands have to match unless one
+  // of the operands is a scalar, or an explicit broadcast dimension is given
+  // (see g3doc for more details).
+
+  // Enqueues a complex compose instruction onto the computation.
+  XlaOp Complex(const XlaOp& real, const XlaOp& imag,
+                tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a complex conjugate instruction onto the computation.
+  XlaOp Conj(const XlaOp& operand);
+
+  // Enqueues an add instruction onto the computation.
+  XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a subtract instruction onto the computation.
+  XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a multiply instruction onto the computation.
+  XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a divide instruction onto the computation.
+  XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a remainder instruction onto the computation.
+  XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a max instruction onto the computation.
+  XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues a min instruction onto the computation.
+  XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Element-wise logical operators
+  XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  XlaOp Not(const XlaOp& operand);
+
+  XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
+                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+  XlaOp ShiftRightArithmetic(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+  XlaOp ShiftRightLogical(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Reduces an array among the provided dimensions, given "computation" as a
+  // reduction operator.
+  XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
+               const XlaComputation& computation,
+               tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
+
+  // Convenience wrapper around the above that reduces all the dimensions in the
+  // operand shape.
+  XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
+                  const XlaComputation& computation);
+
+  // Enqueues a windowed reduce instruction onto the computation.
+  XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
+                     const XlaComputation& computation,
+                     tensorflow::gtl::ArraySlice<int64> window_dimensions,
+                     tensorflow::gtl::ArraySlice<int64> window_strides,
+                     Padding padding);
+
+  // As ReduceWindow(), but the padding is given in the format
+  // returned by MakePadding().
+  XlaOp ReduceWindowWithGeneralPadding(
+      const XlaOp& operand, const XlaOp& init_value,
+      const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+
+  // Returns the sum of the operand value within each subgroup of replicas. All
+  // replicas supply one input to the sum and all replicas receive the resulting
+  // sum for each subgroup.
+  XlaOp CrossReplicaSum(
+      const XlaOp& operand,
+      tensorflow::gtl::ArraySlice<int64> replica_group_ids = {});
+
+  // Enqueues an operation that do an AllReduce of the operand cross cores. Here
+  // AllReduce means doing a reduction on the input operand cross cores and then
+  // broadcasting the reduction result to those cores. The reduction function is
+  // defined by `computation`, which should be a commutative computation on
+  // scalars, e.g., add, min, or max. The way that AllReduce is applied is
+  // configured by:
+  //
+  // - `replica_group_ids`: maps replica ids to subgroup ids. If empty, all
+  // replicas belong to one group. Allreduce will be applied within subgroups.
+  // For example, we have 4 replicas, then replica_group_ids={0,1,0,1} means,
+  // replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1.
+  //
+  // - `channel_id`: for Allreduce nodes from different models, if they have the
+  // same channel_id, they will be 'Allreduce'd. If empty, Allreduce will not be
+  // applied cross models.
+  //
+  // TODO(b/79737069): Rename this to AllReduce when it's ready to use.
+  XlaOp CrossReplicaSum(
+      const XlaOp& operand, const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<int64> replica_group_ids = {},
+      const tensorflow::gtl::optional<ChannelHandle>& channel_id =
+          tensorflow::gtl::nullopt);
+
+  // Enqueues an operation that scatters the `source` array to the selected
+  // indices of each window.
+  XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
+                         tensorflow::gtl::ArraySlice<int64> window_dimensions,
+                         tensorflow::gtl::ArraySlice<int64> window_strides,
+                         Padding padding, const XlaOp& source,
+                         const XlaOp& init_value,
+                         const XlaComputation& scatter);
+
+  // As SelectAndScatter(), but the padding is given in the format
+  // returned by MakePadding().
+  XlaOp SelectAndScatterWithGeneralPadding(
+      const XlaOp& operand, const XlaComputation& select,
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      const XlaOp& source, const XlaOp& init_value,
+      const XlaComputation& scatter);
+
+  // Enqueues an abs instruction onto the computation.
+  XlaOp Abs(const XlaOp& operand);
+
+  // Enqueues a atan2 instruction onto the computation.
+  XlaOp Atan2(const XlaOp& y, const XlaOp& x,
+              tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues an exp instruction onto the computation.
+  XlaOp Exp(const XlaOp& operand);
+
+  // Enqueues an expm1 instruction onto the computation.
+  XlaOp Expm1(const XlaOp& operand);
+
+  // Enqueues a floor instruction onto the computation.
+  XlaOp Floor(const XlaOp& operand);
+
+  // Enqueues a ceil instruction onto the computation.
+  XlaOp Ceil(const XlaOp& operand);
+
+  // Enqueues a round instruction onto the computation, rounding to nearest even
+  // with half-way cases rounding away from zero.
+  XlaOp Round(const XlaOp& operand);
+
+  // Enqueues an log instruction (natural logarithm) onto the computation.
+  XlaOp Log(const XlaOp& operand);
+
+  // Enqueues an log1p instruction (log(x+1)) onto the computation.
+  XlaOp Log1p(const XlaOp& operand);
+
+  // Enqueues a sign instruction onto the computation.
+  XlaOp Sign(const XlaOp& operand);
+
+  // Enqueues a count leading zeros instruction onto the computation.
+  XlaOp Clz(const XlaOp& operand);
+
+  // Enqueues a cosine instruction onto the computation.
+  XlaOp Cos(const XlaOp& operand);
+
+  // Enqueues a sine instruction onto the computation.
+  XlaOp Sin(const XlaOp& operand);
+
+  // Enqueues a tanh instruction onto the computation.
+  XlaOp Tanh(const XlaOp& operand);
+
+  // Enqueues a real-part instruction onto the computation.
+  XlaOp Real(const XlaOp& operand);
+
+  // Enqueues an imaginary-part instruction onto the computation.
+  XlaOp Imag(const XlaOp& operand);
+
+  // Enqueues a lhs^rhs computation onto the computation.
+  XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+  // Enqueues an operator that tests if the operand's values are finite, i.e.,
+  // not Inf or NaN. Defined only for floating-point types. Returns an array of
+  // booleans with the same shape where entries are true iff the corresponding
+  // entry was NaN.
+  XlaOp IsFinite(const XlaOp& operand);
+
+  // Enqueues a convert instruction onto the computation that changes the
+  // element type of the operand array to primitive_type.
+  XlaOp ConvertElementType(const XlaOp& operand,
+                           PrimitiveType new_element_type);
+
+  // Enqueues a no-op instruction onto the computation that changes
+  // the element type of the operand array to primitive_type. The
+  // bit-widths of the source and destination element types must be
+  // identical.
+  XlaOp BitcastConvertType(const XlaOp& operand,
+                           PrimitiveType new_element_type);
+
+  // Enqueues a negate instruction onto the computation.
+  XlaOp Neg(const XlaOp& operand);
+
+  // Enqueues a transpose instruction onto the computation.
+  XlaOp Transpose(const XlaOp& operand,
+                  tensorflow::gtl::ArraySlice<int64> permutation);
+
+  // Enqueues a reverse instruction onto the computation. The order of the
+  // elements in the given dimensions is reversed (i.e., the element at index i
+  // is moved to index dimension_size - 1 - i).
+  XlaOp Rev(const XlaOp& operand,
+            tensorflow::gtl::ArraySlice<int64> dimensions);
+
+  // Enqueues a sort (as increasing order) instruction onto the computation.
+  // If only keys are provided:
+  // * If the keys are an rank-1 tensor (an array), the result is a sorted array
+  // of keys, in ascending order.
+  // * If the keys have higher rank, the keys are sorted along the provided
+  // dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension
+  // value of 0 will indepenently sort every column, and a dimension value of 1
+  // will independently sort each row. If no dimension number is provided, then
+  // the last dimension is chosen by default.
+  //
+  // If both keys and values are provided:
+  // * The keys and the values must tensors with the same dimensions. The
+  // element types of the tensors may be different.
+  // * The result is a tuple that consists of a sorted tensor of keys (along the
+  // provided dimension, as above) as the first element, and a tensor with their
+  // corresponding values as the second element.
+  XlaOp Sort(XlaOp keys,
+             tensorflow::gtl::optional<XlaOp> values = tensorflow::gtl::nullopt,
+             int64 dimension = -1);
+
+  // Enqueues a clamp instruction onto the computation.
+  XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
+
+  // Enqueues a map instruction onto the computation.
+  XlaOp Map(tensorflow::gtl::ArraySlice<XlaOp> operands,
+            const XlaComputation& computation,
+            tensorflow::gtl::ArraySlice<int64> dimensions,
+            tensorflow::gtl::ArraySlice<XlaOp> static_operands = {});
+
+  // Enqueues a N(mu, sigma) random number generation instruction onto the
+  // computation.
+  XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape);
+
+  // Enqueues a U(a, b) random number generation instruction onto the
+  // computation. Returns values in the semi-open interval [a, b).
+  XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
+
+  // Enqueues a while node onto the computation.
+  XlaOp While(const XlaComputation& condition, const XlaComputation& body,
+              const XlaOp& init);
+
+  // Enqueues a conditional node onto the computation.
+  XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
+                    const XlaComputation& true_computation,
+                    const XlaOp& false_operand,
+                    const XlaComputation& false_computation);
+
+  // Enqueues a ReducePrecision node onto the computation.
+  XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
+                        const int mantissa_bits);
+
+  // Enqueues a Gather node onto the computation.
+  XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
+               const GatherDimensionNumbers& dimension_numbers,
+               tensorflow::gtl::ArraySlice<int64> window_bounds);
+
+  // Enqueues a Send node onto the computation for device-to-device
+  // communication, to send the given operand to a Recv instruction that shares
+  // the same channel handle.
+  void Send(const XlaOp& operand, const ChannelHandle& handle);
+  XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
+                      const ChannelHandle& handle);
+
+  // Enqueues a Send node which sends data to the host.
+  XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
+                   const Shape& shape_with_layout, const ChannelHandle& handle);
+
+  // Enqueues a Recv node which receives data from the host.
+  XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
+                     const ChannelHandle& handle);
+
+  // Enqueues an AfterAll operation with no operands producing a token-shaped
+  // value.
+  XlaOp CreateToken();
+
+  // Enqueues an AfterAll operation with no operands producing a token-shaped
+  // value.
+  XlaOp AfterAll(tensorflow::gtl::ArraySlice<XlaOp> tokens);
+
+  // Enqueues a Recv node onto the computation. The data comes from a Send
+  // instruction that shares the same channel handle and its shape must
+  // be the same as the given shape.
+  XlaOp Recv(const Shape& shape, const ChannelHandle& handle);
+  XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
+                      const ChannelHandle& handle);
+
+  // Normalizes operand across spatial and batch dimensions for each feature.
+  //
+  // Returns a tuple (normalized, batch_mean, batch_var) where `normalized`
+  // is the normalized result and batch_mean and batch_var are the mean and
+  // variance, respectively, across batch for the operand.
+  XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
+                          const XlaOp& offset, float epsilon,
+                          int64 feature_index);
+
+  // Normalizes operand across spatial and batch dimensions for each feature.
+  //
+  // `BatchNormInference` is equivalent to calling `BatchNormTraining` without
+  // computing `mean` and `variance` for each batch inside the operation. It
+  // uses the input `mean` and `variance` instead as estimated values. The
+  // purpose of this op is to reduce latency in inference, hence the name
+  // `BatchNormInference`.
+  //
+  // The output has the same shape as `operand`, and contains the normalized
+  // values for each batch.
+  XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
+                           const XlaOp& offset, const XlaOp& mean,
+                           const XlaOp& variance, float epsilon,
+                           int64 feature_index);
+
+  // Calculates the gradients of a batch norm op.
+  //
+  // The inputs `batch_mean` and `batch_var` represent the mean and variance
+  // across the batch.
+  //
+  // Returns a tuple of three elements:
+  //   - grad_operand: Gradient with respect to input `operand`
+  //   - grad_offset: Gradient with respect to input `offset`
+  //   - grad_scale: Gradient with respect to input `scale`
+  XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
+                      const XlaOp& batch_mean, const XlaOp& batch_var,
+                      const XlaOp& grad_output, float epsilon,
+                      int64 feature_index);
+
+  StatusOr<XlaOp> AddInstruction(
+      HloInstructionProto&& instr, HloOpcode opcode,
+      tensorflow::gtl::ArraySlice<XlaOp> operands = {});
+
+  void AddCalledComputation(const XlaComputation& computation,
+                            HloInstructionProto* instr);
+
+  StatusOr<const HloInstructionProto*> LookUpInstruction(const XlaOp& op) const;
+
+  // Internal helper method that does the building for an arbitrary unary op.
+  XlaOp UnaryOp(HloOpcode unop, const XlaOp& operand);
+
+  // Internal helper method that does the building for an arbitrary binary op.
+  // broadcast_dimensions specifies which dimensions to use for broadcasting
+  // when the operation is between tensors of different ranks.
+  XlaOp BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
+                 tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+
+  // Internal helper method that does the building for an arbitrary ternary op.
+  XlaOp TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
+                  const XlaOp& ehs);
+
+  XlaOp RngOp(RandomDistribution distribution,
+              tensorflow::gtl::ArraySlice<XlaOp> parameters,
+              const Shape& shape);
+
+  StatusOr<XlaOp> InDimBroadcast(
+      const Shape& shape, const XlaOp& operand,
+      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+
+  // Internal helper method that creates a sequence of instructions that
+  // performs an explicit broadcast of the operand to the target shape.
+  StatusOr<XlaOp> AddBroadcastSequence(const Shape& output_shape,
+                                       const XlaOp& operand);
+
+  // Internal helper method for creating a Reshape op with the already inferred
+  // shape.
+  StatusOr<XlaOp> Reshape(const Shape& shape, const XlaOp& operand);
+
+  // Returns the (inferred) result for the program shape for the current
+  // computation and fills the root_id in the pointer.
+  StatusOr<ProgramShape> GetProgramShape(int64* root_id) const;
+
+  // Returns shapes for the operands.
+  StatusOr<std::vector<Shape>> GetOperandShapes(
+      tensorflow::gtl::ArraySlice<XlaOp> operands) const;
+
+  // A visitor which checks whether an operation is a compile-time constant,
+  // meaning that it doesn't depend on any parameters, or on any stateful
+  // operation such as `RngNormal` or `Infeed`. The visitor walks the
+  // computation starting at a given operation and sets is_constant to false iff
+  // a parameter or stateful operation is encountered.
+  void IsConstantVisitor(const int64 op_handle, std::set<int64>* visited,
+                         bool* is_constant) const;
+
+  // Checks bounds for convolution parameters.
+  Status VerifyConvolution(
+      const Shape& lhs_shape, const Shape& rhs_shape,
+      const ConvolutionDimensionNumbers& dimension_numbers) const;
+
+  // Helper function for creating a Window proto from user-supplied data.
+  // Returns error if the user-supplied data was invalid.
+  StatusOr<Window> MakeWindow(
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+      tensorflow::gtl::ArraySlice<int64> rhs_dilation) const;
+
+  string name_;  // Name to use for the built computation.
+
+  // The first error encountered while building the computation.
+  // This is OK until the first error is encountered.
+  Status first_error_;
+
+  // The saved stack trace from the point at which the first error occurred.
+  tensorflow::SavedStackTrace first_error_backtrace_;
+
+  // The instructions of this computation.
+  std::vector<HloInstructionProto> instructions_;
+
+  // The embedded computations used by this computation. Each computation was
+  // the entry computation of some XlaComputation, the key is the unique id of
+  // that XlaComputation.
+  std::map<int64, HloComputationProto> embedded_;
+
+  // The unique parameter numbers.
+  tensorflow::gtl::FlatSet<int64> parameter_numbers_;
+
+  // The metadata to attach to each op. This is structured as a "modal"-like
+  // operation, in order to simplify client code (and not sprinkle this metadata
+  // throughout the TensorFlow op kernel implementations).
+  OpMetadata metadata_;
+
+  // Sharding for this operator. This is structured as a "model"-like operation,
+  // in order to simplify client code, similar to metadata_.
+  tensorflow::gtl::optional<OpSharding> sharding_;
+
+  // Mode bit that indicates whether to die when a first error is encountered.
+  bool die_immediately_on_error_ = false;
+
+  XlaBuilder* parent_builder_{nullptr};
+
+  friend XlaOp Parameter(XlaBuilder* builder, int64 parameter_number,
+                         const Shape& shape, const string& name);
+  friend XlaOp ConstantLiteral(XlaBuilder* builder,
+                               const LiteralSlice& literal);
+  template <typename NativeT>
+  friend XlaOp ConstantR0(XlaBuilder* builder, NativeT value);
+  template <typename NativeT>
+  friend XlaOp ConstantR1(XlaBuilder* builder,
+                          tensorflow::gtl::ArraySlice<NativeT> values);
+  friend XlaOp ConstantR1(XlaBuilder* builder,
+                          const tensorflow::core::Bitmap& values);
+  template <typename NativeT>
+  friend XlaOp ConstantR2(
+      XlaBuilder* builder,
+      std::initializer_list<std::initializer_list<NativeT>> values);
+  template <typename NativeT>
+  friend XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder,
+                                           const Array<NativeT>& values,
+                                           const Layout& layout);
+  template <typename NativeT>
+  friend XlaOp ConstantFromArray(XlaBuilder* builder,
+                                 const Array<NativeT>& values);
+  template <typename NativeT>
+  friend XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder,
+                                               const Array2D<NativeT>& values,
+                                               const Layout& layout);
+  template <typename NativeT>
+  friend XlaOp ConstantR2FromArray2D(XlaBuilder* builder,
+                                     const Array2D<NativeT>& values);
+  template <typename NativeT>
+  friend XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder,
+                                               const Array3D<NativeT>& values,
+                                               const Layout& layout);
+  template <typename NativeT>
+  friend XlaOp ConstantR3FromArray3D(XlaBuilder* builder,
+                                     const Array3D<NativeT>& values);
+  template <typename NativeT>
+  friend XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder,
+                                               const Array4D<NativeT>& values,
+                                               const Layout& layout);
+  template <typename NativeT>
+  friend XlaOp ConstantR4FromArray4D(XlaBuilder* builder,
+                                     const Array4D<NativeT>& values);
+
+  template <typename NativeT>
+  friend XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value);
+
+  friend XlaOp Broadcast(const XlaOp& operand,
+                         tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
+
+  friend XlaOp BroadcastInDim(
+      const XlaOp& operand, const Shape& shape,
+      const tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+
+  friend XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
+                   const PaddingConfig& padding_config);
+
+  friend XlaOp Reshape(const XlaOp& operand,
+                       tensorflow::gtl::ArraySlice<int64> dimensions,
+                       tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+  friend XlaOp Reshape(const XlaOp& operand,
+                       tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+  friend XlaOp Collapse(const XlaOp& operand,
+                        tensorflow::gtl::ArraySlice<int64> dimensions);
+
+  friend XlaOp Slice(const XlaOp& operand,
+                     tensorflow::gtl::ArraySlice<int64> start_indices,
+                     tensorflow::gtl::ArraySlice<int64> limit_indices,
+                     tensorflow::gtl::ArraySlice<int64> strides);
+
+  friend XlaOp SliceInDim(const XlaOp& operand, int64 start_index,
+                          int64 limit_index, int64 stride, int64 dimno);
+
+  friend XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
+                            tensorflow::gtl::ArraySlice<int64> slice_sizes);
+
+  friend XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                                  const XlaOp& start_indices);
+
+  friend XlaOp ConcatInDim(XlaBuilder* builder,
+                           tensorflow::gtl::ArraySlice<XlaOp> operands,
+                           int64 dimension);
+
+  friend void Trace(const string& tag, const XlaOp& operand);
+
+  friend XlaOp Select(const XlaOp& pred, const XlaOp& on_true,
+                      const XlaOp& on_false);
+  friend XlaOp Tuple(XlaBuilder* builder,
+                     tensorflow::gtl::ArraySlice<XlaOp> elements);
+  friend XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
+  friend XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
+                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
+                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
+                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
+                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
+                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
+                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
+  friend XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
+                          const DotDimensionNumbers& dimension_numbers);
+  friend XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
+                    tensorflow::gtl::ArraySlice<int64> window_strides,
+                    Padding padding);
+  friend XlaOp ConvWithGeneralPadding(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+  friend XlaOp ConvWithGeneralDimensions(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+      const ConvolutionDimensionNumbers& dimension_numbers);
+  friend XlaOp ConvGeneral(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      const ConvolutionDimensionNumbers& dimension_numbers);
+  friend XlaOp ConvGeneralDilated(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+      tensorflow::gtl::ArraySlice<int64> rhs_dilation,
+      const ConvolutionDimensionNumbers& dimension_numbers);
+  friend XlaOp Fft(const XlaOp& operand, FftType fft_type,
+                   tensorflow::gtl::ArraySlice<int64> fft_length);
+  friend XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
+                      const string& config);
+  friend void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
+                      const string& outfeed_config);
+  friend XlaOp Call(XlaBuilder* builder, const XlaComputation& computation,
+                    tensorflow::gtl::ArraySlice<XlaOp> operands);
+  friend XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
+                          tensorflow::gtl::ArraySlice<XlaOp> operands,
+                          const Shape& shape);
+  friend XlaOp HostCompute(XlaBuilder* builder,
+                           tensorflow::gtl::ArraySlice<XlaOp> operands,
+                           const string& channel_name, int64 cost_estimate_ns,
+                           const Shape& shape);
+  friend XlaOp Complex(const XlaOp& real, const XlaOp& imag,
+                       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Conj(const XlaOp& operand);
+  friend XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
+                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
+                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
+                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
+                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
+                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
+                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
+                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
+                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
+                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
+                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Not(const XlaOp& operand);
+  friend XlaOp ShiftLeft(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp ShiftRightArithmetic(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp ShiftRightLogical(
+      const XlaOp& lhs, const XlaOp& rhs,
+      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
+                      const XlaComputation& computation,
+                      tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
+  friend XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
+                         const XlaComputation& computation);
+  friend XlaOp ReduceWindow(
+      const XlaOp& operand, const XlaOp& init_value,
+      const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding);
+  friend XlaOp ReduceWindowWithGeneralPadding(
+      const XlaOp& operand, const XlaOp& init_value,
+      const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+  friend XlaOp CrossReplicaSum(
+      const XlaOp& operand,
+      tensorflow::gtl::ArraySlice<int64> replica_group_ids);
+  friend XlaOp CrossReplicaSum(
+      const XlaOp& operand, const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<int64> replica_group_ids,
+      const tensorflow::gtl::optional<ChannelHandle>& channel_id);
+  friend XlaOp SelectAndScatter(
+      const XlaOp& operand, const XlaComputation& select,
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+      const XlaOp& source, const XlaOp& init_value,
+      const XlaComputation& scatter);
+  friend XlaOp SelectAndScatterWithGeneralPadding(
+      const XlaOp& operand, const XlaComputation& select,
+      tensorflow::gtl::ArraySlice<int64> window_dimensions,
+      tensorflow::gtl::ArraySlice<int64> window_strides,
+      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+      const XlaOp& source, const XlaOp& init_value,
+      const XlaComputation& scatter);
+  friend XlaOp Abs(const XlaOp& operand);
+  friend XlaOp Atan2(const XlaOp& y, const XlaOp& x,
+                     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp Exp(const XlaOp& operand);
+  friend XlaOp Expm1(const XlaOp& operand);
+  friend XlaOp Floor(const XlaOp& operand);
+  friend XlaOp Ceil(const XlaOp& operand);
+  friend XlaOp Round(const XlaOp& operand);
+  friend XlaOp Log(const XlaOp& operand);
+  friend XlaOp Log1p(const XlaOp& operand);
+  friend XlaOp Sign(const XlaOp& operand);
+  friend XlaOp Clz(const XlaOp& operand);
+  friend XlaOp Cos(const XlaOp& operand);
+  friend XlaOp Sin(const XlaOp& operand);
+  friend XlaOp Tanh(const XlaOp& operand);
+  friend XlaOp Real(const XlaOp& operand);
+  friend XlaOp Imag(const XlaOp& operand);
+  friend XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
+                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+  friend XlaOp IsFinite(const XlaOp& operand);
+  // TODO(b/64798317): Finish CPU & GPU implementation, then replace xla::Iota
+  // in xla/client/lib/numeric.h with this (renamed to xla::Iota).
+  friend XlaOp IotaGen(XlaBuilder* builder, PrimitiveType type, int64 size);
+  friend XlaOp ConvertElementType(const XlaOp& operand,
+                                  PrimitiveType new_element_type);
+  friend XlaOp BitcastConvertType(const XlaOp& operand,
+                                  PrimitiveType new_element_type);
+  friend XlaOp Neg(const XlaOp& operand);
+  friend XlaOp Transpose(const XlaOp& operand,
+                         tensorflow::gtl::ArraySlice<int64> permutation);
+  friend XlaOp Rev(const XlaOp& operand,
+                   tensorflow::gtl::ArraySlice<int64> dimensions);
+  friend XlaOp Sort(XlaOp keys, tensorflow::gtl::optional<XlaOp> values,
+                    int64 dimension);
+  friend XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
+  friend XlaOp Map(XlaBuilder* builder,
+                   tensorflow::gtl::ArraySlice<XlaOp> operands,
+                   const XlaComputation& computation,
+                   tensorflow::gtl::ArraySlice<int64> dimensions,
+                   tensorflow::gtl::ArraySlice<XlaOp> static_operands);
+  friend XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma,
+                         const Shape& shape);
+  friend XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
+  friend XlaOp While(const XlaComputation& condition,
+                     const XlaComputation& body, const XlaOp& init);
+  friend XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
+                           const XlaComputation& true_computation,
+                           const XlaOp& false_operand,
+                           const XlaComputation& false_computation);
+  friend XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
+                               const int mantissa_bits);
+  friend XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
+                      const GatherDimensionNumbers& dimension_numbers,
+                      tensorflow::gtl::ArraySlice<int64> window_bounds);
+  friend void Send(const XlaOp& operand, const ChannelHandle& handle);
+  friend XlaOp Recv(XlaBuilder* builder, const Shape& shape,
+                    const ChannelHandle& handle);
+  friend XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
+                                 const XlaOp& offset, float epsilon,
+                                 int64 feature_index);
+  friend XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
+                                  const XlaOp& offset, const XlaOp& mean,
+                                  const XlaOp& variance, float epsilon,
+                                  int64 feature_index);
+  friend XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
+                             const XlaOp& batch_mean, const XlaOp& batch_var,
+                             const XlaOp& grad_output, float epsilon,
+                             int64 feature_index);
+  friend XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
+                             const ChannelHandle& handle);
+  friend XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
+                             const ChannelHandle& handle);
+  friend XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
+                          const Shape& shape_with_layout,
+                          const ChannelHandle& handle);
+  friend XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
+                            const ChannelHandle& handle);
+  friend XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
+                               const string& config);
+  friend XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
+                                const Shape& shape_with_layout,
+                                const string& outfeed_config);
+  friend XlaOp CreateToken(XlaBuilder* builder);
+  friend XlaOp AfterAll(XlaBuilder* builder,
+                        tensorflow::gtl::ArraySlice<XlaOp> tokens);
+};
+
+// RAII-style object: sets the current sharding assignment in builder on
+// construction, and sets back to the previous assignment on destruction.
+class XlaScopedShardingAssignment {
+ public:
+  XlaScopedShardingAssignment(xla::XlaBuilder* builder,
+                              tensorflow::gtl::optional<OpSharding> sharding)
+      : builder_(builder), prev_sharding_(builder->sharding()) {
+    SetSharding(sharding);
+  }
+
+  XlaScopedShardingAssignment(const XlaScopedShardingAssignment&) = delete;
+  XlaScopedShardingAssignment& operator=(const XlaScopedShardingAssignment&) =
+      delete;
+
+  ~XlaScopedShardingAssignment() { SetSharding(prev_sharding_); }
+
+ private:
+  void SetSharding(const tensorflow::gtl::optional<OpSharding>& sharding) {
+    if (sharding.has_value()) {
+      builder_->SetSharding(sharding.value());
+    } else {
+      builder_->ClearSharding();
+    }
+  }
+
+  xla::XlaBuilder* const builder_;
+  tensorflow::gtl::optional<OpSharding> prev_sharding_;
+};
+
+// Free functions for building XlaOps. The intention is that these will
+// become the public API for building XlaOps rather than calling methods on
+// XlaBuilder directly.
+
+// Enqueues a "retrieve parameter value" instruction for a parameter that was
+// passed to the computation.
+XlaOp Parameter(XlaBuilder* builder, int64 parameter_number, const Shape& shape,
+                const string& name);
+
+// Enqueues a constant with the value of the given literal onto the
+// computation.
+XlaOp ConstantLiteral(XlaBuilder* builder, const LiteralSlice& literal);
+
+// Enqueues a constant onto the computation. Methods are templated on the
+// native host type (NativeT) which corresponds to a specific XLA
+// PrimitiveType as given in the following table:
+//
+//  Native Type   PrimitiveType
+// -----------------------------
+//   bool           PRED
+//   int32          S32
+//   int64          S64
+//   uint32         U32
+//   uint64         U64
+//   float          F32
+//   double         F64
+//
+// Note: not all primitive types defined in xla_data.proto have a
+// corresponding native type yet.
+template <typename NativeT>
+XlaOp ConstantR0(XlaBuilder* builder, NativeT value);
+template <typename NativeT>
+XlaOp ConstantR1(XlaBuilder* builder,
+                 tensorflow::gtl::ArraySlice<NativeT> values);
+XlaOp ConstantR1(XlaBuilder* builder, const tensorflow::core::Bitmap& values);
+template <typename NativeT>
+XlaOp ConstantR2(XlaBuilder* builder,
+                 std::initializer_list<std::initializer_list<NativeT>> values);
+template <typename NativeT>
+XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder,
+                                  const Array<NativeT>& values,
+                                  const Layout& layout);
+template <typename NativeT>
+XlaOp ConstantFromArray(XlaBuilder* builder, const Array<NativeT>& values);
+template <typename NativeT>
+XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder,
+                                      const Array2D<NativeT>& values,
+                                      const Layout& layout);
+template <typename NativeT>
+XlaOp ConstantR2FromArray2D(XlaBuilder* builder,
+                            const Array2D<NativeT>& values);
+template <typename NativeT>
+XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder,
+                                      const Array3D<NativeT>& values,
+                                      const Layout& layout);
+template <typename NativeT>
+XlaOp ConstantR3FromArray3D(XlaBuilder* builder,
+                            const Array3D<NativeT>& values);
+template <typename NativeT>
+XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder,
+                                      const Array4D<NativeT>& values,
+                                      const Layout& layout);
+template <typename NativeT>
+XlaOp ConstantR4FromArray4D(XlaBuilder* builder,
+                            const Array4D<NativeT>& values);
+
+// Enqueues a rank one constant (XlaBuilder* builder, vector) onto the
+// computation. The vector has size 'length' and every element has the value
+// 'value'.
+template <typename NativeT>
+XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value);
+
+// Adds dimensions to an array by duplicating the data in the array.
+//
+// The new dimensions are inserted on the left, i.e. if
+// broadcast_sizes has values {a0, ..., aN} and the operand shape
+// has dimensions {b0, ..., bM} then the shape of the output has
+// dimensions {a0, ..., aN, b0, ..., bM}.
+//
+// The new dimensions index into copies of the operand, i.e.
+//
+//   output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
+XlaOp Broadcast(const XlaOp& operand,
+                tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
+
+// Performs in-dimension-style broadcast.
+//
+// Operand specifies the input to be broadcast. "shape" is expected output
+// shape. "broadcast_dimensions" are the dimensions to be broadcasting into.
+// Dimension numbers in broadcast_dimensions map to individual dimensions
+// of the operand, and specify what dimension of the output shape they
+// should be broadcast.
+// e.g.
+// Say operand = [1, 2], i.e., a 1D tensor with 2 elements.
+// and dimension of shape is [2,2].
+// Specifying {1} as brodcast_dimension will generate output
+// [1 , 2]
+// [1 , 2]
+// On the other hand, specifying {0} as broadcast_dimension
+// will generate output
+// [1 , 1]
+// [2 , 2]
+XlaOp BroadcastInDim(
+    const XlaOp& operand, const Shape& shape,
+    const tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
+
+// Enqueues a pad operation onto the computation that pads the given value on
+// the edges as well as between the elements of the input. padding_config
+// specifies the padding amount for each dimension.
+XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
+          const PaddingConfig& padding_config);
+
+// Enqueues an operation onto the computation that flattens the operand based
+// on the dimension order (major/slowest-varying to minor/fastest-varying)
+// given, followed by reshaping it into the shape with the given dimension
+// sizes (also major to minor). Conceptually, this is a limited form of
+// "shape casting".
+XlaOp Reshape(const XlaOp& operand,
+              tensorflow::gtl::ArraySlice<int64> dimensions,
+              tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+// Enqueues an operation onto the computation that collapses the operand, from
+// first to last dimension (C order), then reshapes it to the given dimension
+// sizes. Conceptually, this is a limited form of "shape casting".
+XlaOp Reshape(const XlaOp& operand,
+              tensorflow::gtl::ArraySlice<int64> new_sizes);
+
+// Wrapper for Reshape.
+// Enqueues an operation to collapse the provided dimensions; e.g. an
+// operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to
+// {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must
+// be a consecutive, in-order subsequence of the operand dimensions.
+//
+// Note that collapsing a single dimension does nothing:
+//
+//    {256} collapsing {0} => {256}
+//    {1} collapsing {0} => {1}
+//
+// Collapsing multiple dimensions produces a single result dimension:
+//
+//    {256, 2} collapsing {0,1} => {512}
+//    {256, 2, 3} collapsing {0,1} => {512, 3}
+//
+// This could potentially cause data to be moved -- it provides a more
+// structured form of reshaping than an arbitrary Reshape operation.
+XlaOp Collapse(const XlaOp& operand,
+               tensorflow::gtl::ArraySlice<int64> dimensions);
+
+// Enqueues a slice operation onto the computation that slices the operand
+// from the start indices to the limit indices; e.g.
+//
+//        x
+//   [ 0 1 2 3 ]
+// y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ]
+//   [ 8 9 a b ]
+//
+// Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
+// range notation.
+// The strides parameter determines the stride over the slice
+XlaOp Slice(const XlaOp& operand,
+            tensorflow::gtl::ArraySlice<int64> start_indices,
+            tensorflow::gtl::ArraySlice<int64> limit_indices,
+            tensorflow::gtl::ArraySlice<int64> strides);
+
+// Enqueues a slice operation in a given dimension, taking all other
+// dimensions as they are; e.g. if dimno is 1 from start_index 2 to
+// limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand
+// for:
+//
+//  array[:, 2:4:1, :]
+XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
+                 int64 stride, int64 dimno);
+
+// Enqueues a slice operation onto the computation that slices the 'operand'
+// from dynamic start indices which are passed in 'start_indices'.
+// The size of the slice in each dimension is passed in 'slice_sizes',
+// which specify the end point of exclusive slice intervals in each
+// dimension [start, start + size).
+// The shape of 'start_indices' must be rank == 1, with dimension size
+// equal to the rank of the 'operand'.
+// Slice index calculations are computed modulo input dimension sizes to
+// prevent dynamic start indices from generating out-of-bound array accesses.
+XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
+                   tensorflow::gtl::ArraySlice<int64> slice_sizes);
+
+// Enqueues a dynamic update slice operation onto the computation, which
+// updates a slice of 'operand' with 'update' at dynamic 'start_indices'.
+// The shape of 'update' determines the shape of the slice of 'operand'
+// which is updated.
+// The indices specified in 'start_indices' specify the offset of the slice
+// of 'operand' which is updated.
+//
+//               update = {10, 11} // calculated at runtime.
+//   [1 2 3]     start  = {1, 1}   // calculated at runtime.  [1 2  3 ]
+//   [4 5 6]  => DynamicUpdateslice(data, update, start)   => [4 10 11]
+//   [7 8 9]                                                  [7 8  9 ]
+//
+// The shape of 'start_indices' must be rank == 1, with dimension size
+// equal to the rank of the 'operand'.
+// Slice index calculations are computed modulo update dimension sizes to
+// prevent dynamic start indices from generating out-of-bound array accesses.
+XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
+                         const XlaOp& start_indices);
+
+// Enqueues a concatenate instruction onto the computation. 'operands' must
+// have >= 1 entry.
+XlaOp ConcatInDim(XlaBuilder* builder,
+                  tensorflow::gtl::ArraySlice<XlaOp> operands, int64 dimension);
+
+// Enqueue a tracing operation onto the computation; the computation will emit
+// a logging message with the operand.
+void Trace(const string& tag, const XlaOp& operand);
+
+// Enqueues a conditional-move-like select operation onto the computation;
+// predicated on pred, selects between on_true and on_false.
+XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false);
+
+// Enqueues a tuple-creation instruction onto the computation.
+XlaOp Tuple(XlaBuilder* builder, tensorflow::gtl::ArraySlice<XlaOp> elements);
+
+// Enqueues a tuple-element-get instruction onto the computation.
+XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
+
+// Enqueues an equal-to comparison instruction onto the computation.
+XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
+         tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a not-equal comparison instruction onto the computation.
+XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
+         tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a greater-or-equal comparison instruction onto the computation.
+XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
+         tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a greater-than comparison instruction onto the computation.
+XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
+         tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a less-than comparison instruction onto the computation.
+XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
+         tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a less-or-equal comparison instruction onto the computation.
+XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
+         tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a dot instruction onto the computation.
+XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
+
+// Enqueues a general dot instruction onto the computation.
+XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
+                 const DotDimensionNumbers& dimension_numbers);
+
+// Enqueues a convolution instruction onto the computation, which uses the
+// default convolution dimension numbers.
+XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
+           tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding);
+
+// Enqueues a convolution instruction onto the computation, with the caller
+// provided padding configuration in the format returned by MakePadding().
+XlaOp ConvWithGeneralPadding(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+
+// Enqueues a convolution instruction onto the computation, with the caller
+// provided dimension numbers configuration.
+XlaOp ConvWithGeneralDimensions(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
+    const ConvolutionDimensionNumbers& dimension_numbers);
+
+// Enqueues a convolution instruction onto the computation, with the caller
+// provided padding configuration as well as the dimension numbers.
+XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
+                  tensorflow::gtl::ArraySlice<int64> window_strides,
+                  tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+                  const ConvolutionDimensionNumbers& dimension_numbers);
+
+// Enqueues a convolution instruction onto the computation, with the caller
+// provided padding configuration, dilation factors and dimension numbers.
+XlaOp ConvGeneralDilated(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    tensorflow::gtl::ArraySlice<int64> lhs_dilation,
+    tensorflow::gtl::ArraySlice<int64> rhs_dilation,
+    const ConvolutionDimensionNumbers& dimension_numbers);
+
+// Enqueues an FFT instruction onto the computation, of the given type and
+// with the given FFT length.
+XlaOp Fft(const XlaOp& operand, FftType fft_type,
+          tensorflow::gtl::ArraySlice<int64> fft_length);
+
+// Enqueues an infeed instruction onto the computation, which writes data of
+// the given shape to the infeed buffer of the device.
+XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
+             const string& config = "");
+
+// Variant of Infeed which takes a token-shaped operand and produces a
+// two-element tuple containing the data value and a token-shaped value.
+// Tokens are used for ordering side-effecting operations.
+// TODO(b/110532604): Replace all uses of the non-token form with this variant.
+XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
+                      const string& config = "");
+
+// Enqueues an outfeed instruction onto the computation. This instruction
+// generates outgoing data transfers for the given data.
+//
+// shape_with_layout communicates the laid out shape that we want to outfeed
+// -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error
+// will occur.
+void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
+             const string& outfeed_config);
+
+// Variant of Outfeed which takes a token-shaped operand and produces a
+// token-shaped value. Tokens are used for ordering side-effecting operations.
+// TODO(b/110532604): Replace all uses of the non-token form with this variant.
+XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
+                       const Shape& shape_with_layout,
+                       const string& outfeed_config);
+
+// Enqueues a call instruction onto the computation.
+XlaOp Call(XlaBuilder* builder, const XlaComputation& computation,
+           tensorflow::gtl::ArraySlice<XlaOp> operands);
+
+// Enqueues a custom call instruction onto the computation.
+// During code generation, a call instruction is emitted which targets a
+// symbol with the name |call_target_name|.  The |operands| are passed to the
+// call instruction.  |shape| is the resultant shape.
+XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
+                 tensorflow::gtl::ArraySlice<XlaOp> operands,
+                 const Shape& shape);
+
+// Enqueues a pseudo-op to represent host-side computation data-dependencies.
+// During code generation, host send and receive operations will be generated
+// to transfer |operands| to the host and a single result of |shape| back to
+// the device.  Host send/recv operations are emitted using |channel_name|.
+// Dataflow dependencies and the |cost_estimate_ns| field may be used in HLO
+// instruction scheduling.
+XlaOp HostCompute(XlaBuilder* builder,
+                  tensorflow::gtl::ArraySlice<XlaOp> operands,
+                  const string& channel_name, int64 cost_estimate_ns,
+                  const Shape& shape);
+
+// The following methods enqueue element-wise binary arithmetic operations
+// onto the computation. The shapes of the operands have to match unless one
+// of the operands is a scalar, or an explicit broadcast dimension is given
+// (see g3doc for more details).
+
+// Enqueues a complex compose instruction onto the computation.
+XlaOp Complex(const XlaOp& real, const XlaOp& imag,
+              tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a complex conjugate instruction onto the computation.
+XlaOp Conj(const XlaOp& operand);
+
+// Enqueues an add instruction onto the computation.
+XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
+          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a subtract instruction onto the computation.
+XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
+          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a multiply instruction onto the computation.
+XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
+          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a divide instruction onto the computation.
+XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
+          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a remainder instruction onto the computation.
+XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
+          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a max instruction onto the computation.
+XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
+          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues a min instruction onto the computation.
+XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
+          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Element-wise logical operators
+XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
+          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
+         tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
+          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+XlaOp Not(const XlaOp& operand);
+
+XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
+                tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+XlaOp ShiftRightArithmetic(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+XlaOp ShiftRightLogical(
+    const XlaOp& lhs, const XlaOp& rhs,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Reduces an array among the provided dimensions, given "computation" as a
+// reduction operator.
+XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
+             const XlaComputation& computation,
+             tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
+
+// Convenience wrapper around the above that reduces all the dimensions in the
+// operand shape.
+XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
+                const XlaComputation& computation);
+
+// Enqueues a windowed reduce instruction onto the computation.
+XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
+                   const XlaComputation& computation,
+                   tensorflow::gtl::ArraySlice<int64> window_dimensions,
+                   tensorflow::gtl::ArraySlice<int64> window_strides,
+                   Padding padding);
+
+// As ReduceWindow(), but the padding is given in the format
+// returned by MakePadding().
+XlaOp ReduceWindowWithGeneralPadding(
+    const XlaOp& operand, const XlaOp& init_value,
+    const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
+
+// Returns the sum of the operand value within each subgroup of replicas. All
+// replicas supply one input to the sum and all replicas receive the resulting
+// sum for each subgroup.
+XlaOp CrossReplicaSum(
+    const XlaOp& operand,
+    tensorflow::gtl::ArraySlice<int64> replica_group_ids = {});
+
+// Enqueues an operation that do an AllReduce of the operand cross cores. Here
+// AllReduce means doing a reduction on the input operand cross cores and then
+// broadcasting the reduction result to those cores. The reduction function is
+// defined by `computation`, which should be a commutative computation on
+// scalars, e.g., add, min, or max. The way that AllReduce is applied is
+// configured by:
+//
+// - `replica_group_ids`: maps replica ids to subgroup ids. If empty, all
+// replicas belong to one group. Allreduce will be applied within subgroups.
+// For example, we have 4 replicas, then replica_group_ids={0,1,0,1} means,
+// replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1.
+//
+// - `channel_id`: for Allreduce nodes from different models, if they have the
+// same channel_id, they will be 'Allreduce'd. If empty, Allreduce will not be
+// applied cross models.
+//
+// TODO(b/79737069): Rename this to AllReduce when it's ready to use.
+XlaOp CrossReplicaSum(const XlaOp& operand, const XlaComputation& computation,
+                      tensorflow::gtl::ArraySlice<int64> replica_group_ids = {},
+                      const tensorflow::gtl::optional<ChannelHandle>&
+                          channel_id = tensorflow::gtl::nullopt);
+
+// Enqueues an operation that scatters the `source` array to the selected
+// indices of each window.
+XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
+                       tensorflow::gtl::ArraySlice<int64> window_dimensions,
+                       tensorflow::gtl::ArraySlice<int64> window_strides,
+                       Padding padding, const XlaOp& source,
+                       const XlaOp& init_value, const XlaComputation& scatter);
+
+// As SelectAndScatter(), but the padding is given in the format
+// returned by MakePadding().
+XlaOp SelectAndScatterWithGeneralPadding(
+    const XlaOp& operand, const XlaComputation& select,
+    tensorflow::gtl::ArraySlice<int64> window_dimensions,
+    tensorflow::gtl::ArraySlice<int64> window_strides,
+    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
+    const XlaOp& source, const XlaOp& init_value,
+    const XlaComputation& scatter);
+
+// Enqueues an abs instruction onto the computation.
+XlaOp Abs(const XlaOp& operand);
+
+// Enqueues a atan2 instruction onto the computation.
+XlaOp Atan2(const XlaOp& y, const XlaOp& x,
+            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues an exp instruction onto the computation.
+XlaOp Exp(const XlaOp& operand);
+
+// Enqueues an expm1 instruction onto the computation.
+XlaOp Expm1(const XlaOp& operand);
+
+// Enqueues a floor instruction onto the computation.
+XlaOp Floor(const XlaOp& operand);
+
+// Enqueues a ceil instruction onto the computation.
+XlaOp Ceil(const XlaOp& operand);
+
+// Enqueues a round instruction onto the computation, rounding to nearest even
+// with half-way cases rounding away from zero.
+XlaOp Round(const XlaOp& operand);
+
+// Enqueues an log instruction (natural logarithm) onto the computation.
+XlaOp Log(const XlaOp& operand);
+
+// Enqueues an log1p instruction (log(x+1)) onto the computation.
+XlaOp Log1p(const XlaOp& operand);
+
+// Enqueues a sign instruction onto the computation.
+XlaOp Sign(const XlaOp& operand);
+
+// Enqueues a count leading zeros instruction onto the computation.
+XlaOp Clz(const XlaOp& operand);
+
+// Enqueues a cosine instruction onto the computation.
+XlaOp Cos(const XlaOp& operand);
+
+// Enqueues a sine instruction onto the computation.
+XlaOp Sin(const XlaOp& operand);
+
+// Enqueues a tanh instruction onto the computation.
+XlaOp Tanh(const XlaOp& operand);
+
+// Enqueues a real-part instruction onto the computation.
+XlaOp Real(const XlaOp& operand);
+
+// Enqueues an imaginary-part instruction onto the computation.
+XlaOp Imag(const XlaOp& operand);
+
+// Enqueues a lhs^rhs computation onto the computation.
+XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
+          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
+
+// Enqueues an operator that tests if the operand's values are finite, i.e.,
+// not Inf or NaN. Defined only for floating-point types. Returns an array of
+// booleans with the same shape where entries are true iff the corresponding
+// entry was NaN.
+XlaOp IsFinite(const XlaOp& operand);
+
+// Enqueues a convert instruction onto the computation that changes the
+// element type of the operand array to primitive_type.
+XlaOp ConvertElementType(const XlaOp& operand, PrimitiveType new_element_type);
+
+// Enqueues a no-op instruction onto the computation that changes
+// the element type of the operand array to primitive_type. The
+// bit-widths of the source and destination element types must be
+// identical.
+XlaOp BitcastConvertType(const XlaOp& operand, PrimitiveType new_element_type);
+
+// Enqueues a negate instruction onto the computation.
+XlaOp Neg(const XlaOp& operand);
+
+// Enqueues a transpose instruction onto the computation.
+XlaOp Transpose(const XlaOp& operand,
+                tensorflow::gtl::ArraySlice<int64> permutation);
+
+// Enqueues a reverse instruction onto the computation. The order of the
+// elements in the given dimensions is reversed (i.e., the element at index i
+// is moved to index dimension_size - 1 - i).
+XlaOp Rev(const XlaOp& operand, tensorflow::gtl::ArraySlice<int64> dimensions);
+
+// Enqueues a sort (as increasing order) instruction onto the computation.
+// If only keys are provided:
+// * If the keys are an rank-1 tensor (an array), the result is a sorted array
+// of keys, in ascending order.
+// * If the keys have higher rank, the keys are sorted along the provided
+// dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension
+// value of 0 will indepenently sort every column, and a dimension value of 1
+// will independently sort each row. If no dimension number is provided, then
+// the last dimension is chosen by default.
+//
+// If both keys and values are provided:
+// * The keys and the values must tensors with the same dimensions. The
+// element types of the tensors may be different.
+// * The result is a tuple that consists of a sorted tensor of keys (along the
+// provided dimension, as above) as the first element, and a tensor with their
+// corresponding values as the second element.
+XlaOp Sort(XlaOp keys,
+           tensorflow::gtl::optional<XlaOp> values = tensorflow::gtl::nullopt,
+           int64 dimension = -1);
+
+// Enqueues a clamp instruction onto the computation.
+XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
+
+// Enqueues a map instruction onto the computation.
+XlaOp Map(XlaBuilder* builder, tensorflow::gtl::ArraySlice<XlaOp> operands,
+          const XlaComputation& computation,
+          tensorflow::gtl::ArraySlice<int64> dimensions,
+          tensorflow::gtl::ArraySlice<XlaOp> static_operands = {});
+
+// Enqueues a N(mu, sigma) random number generation instruction onto the
+// computation.
+XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape);
+
+// Enqueues a U(a, b) random number generation instruction onto the
+// computation. Returns values in the semi-open interval [a, b).
+XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
+
+// Enqueues a while node onto the computation.
+XlaOp While(const XlaComputation& condition, const XlaComputation& body,
+            const XlaOp& init);
+
+// Enqueues a conditional node onto the computation.
+XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
+                  const XlaComputation& true_computation,
+                  const XlaOp& false_operand,
+                  const XlaComputation& false_computation);
+
+// Enqueues a ReducePrecision node onto the computation.
+XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
+                      const int mantissa_bits);
+
+// Enqueues a Gather node onto the computation.
+XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
+             const GatherDimensionNumbers& dimension_numbers,
+             tensorflow::gtl::ArraySlice<int64> window_bounds);
+
+// Enqueues a Send node onto the computation for device-to-device
+// communication. This operation sends the given operand to
+// a Recv instruction in a different computation that shares the same channel
+// handle.
+void Send(const XlaOp& operand, const ChannelHandle& handle);
+
+// Variant of Send which takes a token-shaped operand and produces a
+// token-shaped value.  Tokens are used for ordering side-effecting operations.
+// TODO(b/110532604): Replace all uses of the non-token form with this variant.
+XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
+                    const ChannelHandle& handle);
+
+// Enqueues a Recv node onto the computation for device-to-device
+// communication. The data comes from a Send instruction in a different
+// computation that shares the same channel handle and its shape must be the
+// same as the given shape.
+XlaOp Recv(XlaBuilder* builder, const Shape& shape,
+           const ChannelHandle& handle);
+
+// Variant of Recv which takes a token-shaped operand and produces a two-element
+// tuple containing the data value and a token-shaped value. Tokens are used
+// for ordering side-effecting operations.
+// TODO(b/110532604): Replace all uses of the non-token form with this variant.
+XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
+                    const ChannelHandle& handle);
+
+// Enqueues a Send node which transfers data from the device to the host. The
+// 'shape_with_layout' argument defines the layout of the data transferred; its
+// shape must be compatible with the shape of the operand. The operand must be
+// array-shaped.
+// TODO(b/111544877): Support tuple shapes.
+XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
+                 const Shape& shape_with_layout, const ChannelHandle& handle);
+
+// Enqueues a Recv node which transfers data from the host to the device. The
+// given shape must contain a layout and must be an array.
+// TODO(b/111544877): Support tuple shapes.
+XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
+                   const ChannelHandle& handle);
+
+// Enqueues an operation (AfterAll) with no operands that produces a
+// token-shaped value.  Tokens are used for ordering side-effecting operations.
+// This is a separate method from AfterAll to facility the removal of
+// operand-less AfterAll instructions.
+// TODO(b/110532604): Remove this function when all tokens are derived from a
+// single token generated or passed into the entry computation.
+XlaOp CreateToken(XlaBuilder* builder);
+
+// Enqueues an AfterAll instruction which produces a token-shaped value and
+// takes a variadic number of token-shaped operands. The number of operands must
+// be greater than zero. Used for joining tokens.
+XlaOp AfterAll(XlaBuilder* builder, tensorflow::gtl::ArraySlice<XlaOp> tokens);
+
+// Normalizes operand across spatial and batch dimensions for each feature.
+//
+// Returns a tuple (normalized, batch_mean, batch_var) where `normalized`
+// is the normalized result and batch_mean and batch_var are the mean and
+// variance, respectively, across batch for the operand.
+XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
+                        const XlaOp& offset, float epsilon,
+                        int64 feature_index);
+
+// Normalizes operand across spatial and batch dimensions for each feature.
+//
+// `BatchNormInference` is equivalent to calling `BatchNormTraining` without
+// computing `mean` and `variance` for each batch inside the operation. It
+// uses the input `mean` and `variance` instead as estimated values. The
+// purpose of this op is to reduce latency in inference, hence the name
+// `BatchNormInference`.
+//
+// The output has the same shape as `operand`, and contains the normalized
+// values for each batch.
+XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
+                         const XlaOp& offset, const XlaOp& mean,
+                         const XlaOp& variance, float epsilon,
+                         int64 feature_index);
+
+// Calculates the gradients of a batch norm op.
+//
+// The inputs `batch_mean` and `batch_var` represent the mean and variance
+// across the batch.
+//
+// Returns a tuple of three elements:
+//   - grad_operand: Gradient with respect to input `operand`
+//   - grad_offset: Gradient with respect to input `offset`
+//   - grad_scale: Gradient with respect to input `scale`
+XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
+                    const XlaOp& batch_mean, const XlaOp& batch_var,
+                    const XlaOp& grad_output, float epsilon,
+                    int64 feature_index);
+
+// Implementation details below this point.
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR0(NativeT value) {
+  return ConstantLiteral(*LiteralUtil::CreateR0<NativeT>(value));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR1(tensorflow::gtl::ArraySlice<NativeT> values) {
+  return ConstantLiteral(*LiteralUtil::CreateR1<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR1(int64 length, NativeT value) {
+  Literal literal(ShapeUtil::MakeShape(
+      primitive_util::NativeToPrimitiveType<NativeT>(), {length}));
+  literal.PopulateWithValue(value);
+  return ConstantLiteral(literal);
+}
+
+inline XlaOp XlaBuilder::ConstantR1(const tensorflow::core::Bitmap& values) {
+  return ConstantLiteral(*LiteralUtil::CreateR1(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR2(
+    std::initializer_list<std::initializer_list<NativeT>> values) {
+  return ConstantLiteral(*LiteralUtil::CreateR2<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantFromArrayWithLayout(const Array<NativeT>& values,
+                                              const Layout& layout) {
+  return ConstantLiteral(
+      *LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantFromArray(const Array<NativeT>& values) {
+  return ConstantLiteral(*LiteralUtil::CreateFromArray<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR2FromArray2DWithLayout(
+    const Array2D<NativeT>& values, const Layout& layout) {
+  return ConstantLiteral(
+      *LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR2FromArray2D(const Array2D<NativeT>& values) {
+  return ConstantLiteral(*LiteralUtil::CreateR2FromArray2D<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR3FromArray3DWithLayout(
+    const Array3D<NativeT>& values, const Layout& layout) {
+  return ConstantLiteral(
+      *LiteralUtil::CreateR3FromArray3DWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR3FromArray3D(const Array3D<NativeT>& values) {
+  return ConstantFromArray(values);
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR4FromArray4DWithLayout(
+    const Array4D<NativeT>& values, const Layout& layout) {
+  return ConstantFromArrayWithLayout(values, layout);
+}
+
+template <typename NativeT>
+XlaOp XlaBuilder::ConstantR4FromArray4D(const Array4D<NativeT>& values) {
+  return ConstantFromArray(values);
+}
+
+// Free function template implementations.
+
+template <typename NativeT>
+XlaOp ConstantR0(XlaBuilder* builder, NativeT value) {
+  return ConstantLiteral(builder, *LiteralUtil::CreateR0<NativeT>(value));
+}
+
+template <typename NativeT>
+XlaOp ConstantR1(XlaBuilder* builder,
+                 tensorflow::gtl::ArraySlice<NativeT> values) {
+  return ConstantLiteral(builder, *LiteralUtil::CreateR1<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value) {
+  Literal literal(ShapeUtil::MakeShape(
+      primitive_util::NativeToPrimitiveType<NativeT>(), {length}));
+  literal.PopulateWithValue(value);
+  return ConstantLiteral(builder, literal);
+}
+
+inline XlaOp ConstantR1(XlaBuilder* builder,
+                        const tensorflow::core::Bitmap& values) {
+  return ConstantLiteral(builder, *LiteralUtil::CreateR1(values));
+}
+
+template <typename NativeT>
+XlaOp ConstantR2(XlaBuilder* builder,
+                 std::initializer_list<std::initializer_list<NativeT>> values) {
+  return ConstantLiteral(builder, *LiteralUtil::CreateR2<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder,
+                                  const Array<NativeT>& values,
+                                  const Layout& layout) {
+  return ConstantLiteral(
+      builder,
+      *LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp ConstantFromArray(XlaBuilder* builder, const Array<NativeT>& values) {
+  return ConstantLiteral(builder,
+                         *LiteralUtil::CreateFromArray<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder,
+                                      const Array2D<NativeT>& values,
+                                      const Layout& layout) {
+  return ConstantLiteral(
+      builder,
+      *LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp ConstantR2FromArray2D(XlaBuilder* builder,
+                            const Array2D<NativeT>& values) {
+  return ConstantLiteral(builder,
+                         *LiteralUtil::CreateR2FromArray2D<NativeT>(values));
+}
+
+template <typename NativeT>
+XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder,
+                                      const Array3D<NativeT>& values,
+                                      const Layout& layout) {
+  return ConstantLiteral(
+      builder,
+      *LiteralUtil::CreateR3FromArray3DWithLayout<NativeT>(values, layout));
+}
+
+template <typename NativeT>
+XlaOp ConstantR3FromArray3D(XlaBuilder* builder,
+                            const Array3D<NativeT>& values) {
+  return ConstantFromArray(builder, values);
+}
+
+template <typename NativeT>
+XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder,
+                                      const Array4D<NativeT>& values,
+                                      const Layout& layout) {
+  return ConstantFromArrayWithLayout(builder, values, layout);
+}
+
+template <typename NativeT>
+XlaOp ConstantR4FromArray4D(XlaBuilder* builder,
+                            const Array4D<NativeT>& values) {
+  return ConstantFromArray(builder, values);
+}
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_BUILDER_H_
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc
similarity index 99%
rename from tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc
rename to tensorflow/compiler/xla/client/xla_builder_test.cc
index 3b8beb2c7840e23752b5f47bbc5f55d89751884d..28a207b137d901213ec43d506a638ef08a6bded9 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc
+++ b/tensorflow/compiler/xla/client/xla_builder_test.cc
@@ -13,10 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 #include <string>
 
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
diff --git a/tensorflow/compiler/xla/client/xla_client/BUILD b/tensorflow/compiler/xla/client/xla_client/BUILD
index 763653c685cb0d821c11bf3e25f526db0dcb4945..2e131dbad26970d4cb9860c17c3de3d52de36223 100644
--- a/tensorflow/compiler/xla/client/xla_client/BUILD
+++ b/tensorflow/compiler/xla/client/xla_client/BUILD
@@ -23,57 +23,11 @@ filegroup(
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
-cc_library(
-    name = "xla_computation",
-    srcs = ["xla_computation.cc"],
-    hdrs = ["xla_computation.h"],
-    deps = [
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo_proto",
-    ],
-)
-
 cc_library(
     name = "xla_builder",
-    srcs = ["xla_builder.cc"],
     hdrs = ["xla_builder.h"],
     visibility = ["//visibility:public"],
     deps = [
-        ":xla_computation",
-        "//tensorflow/compiler/xla:execution_options_util",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:padding",
-        "//tensorflow/compiler/xla/client:sharding_builder",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_proto",
-        "//tensorflow/compiler/xla/service:shape_inference",
-        "//tensorflow/core:lib",
-    ],
-)
-
-tf_cc_test(
-    name = "xla_builder_test",
-    srcs = ["xla_builder_test.cc"],
-    deps = [
-        ":xla_builder",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla:test_helpers",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_matchers",
-        "//tensorflow/core:test",
+        "//tensorflow/compiler/xla/client:xla_builder",
     ],
 )
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index 445c1e0d77081fea246c2c2c44f7565acb0f30db..ce2a8afd4cb1e7037e68a02670af707f3ff9252c 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -16,2214 +16,6 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
 #define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
 
-#include <map>
-#include <string>
-#include <type_traits>
-#include <utility>
-
-#include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/hlo.pb.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
-#include "tensorflow/core/lib/gtl/array_slice.h"
-#include "tensorflow/core/lib/gtl/flatset.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/stacktrace.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace xla {
-
-class XlaBuilder;
-
-// This represents an instruction that has been enqueued using the XlaBuilder.
-// This is used to pass to subsequent computations that depends upon the
-// instruction as an operand.
-class XlaOp {
- public:
-  XlaOp() : handle_(-1), builder_(nullptr) {
-    static_assert(std::is_trivially_destructible<XlaOp>::value,
-                  "XlaOp should be trivially destructible");
-  }
-  ~XlaOp() = default;
-
-  XlaBuilder* builder() const { return builder_; }
-
-  // Returns true if the XlaOp represents valid, non-erroneous value.
-  bool valid() const { return handle_ >= 0; }
-
-  // Returns true if the XlaOp was created by the XlaOp() constructor and
-  // not returned by a builder.
-  bool IsUninitialized() const { return builder_ == nullptr; }
-
-  bool IsIdenticalTo(const XlaOp& rhs) const {
-    return handle_ == rhs.handle_ && builder_ == rhs.builder_;
-  }
-
-  friend std::ostream& operator<<(std::ostream& out, const XlaOp& op) {
-    out << op.handle();
-    return out;
-  }
-
- private:
-  explicit XlaOp(XlaBuilder* builder) : handle_(-1), builder_(builder) {}
-  XlaOp(int64 handle, XlaBuilder* builder)
-      : handle_(handle), builder_(builder) {}
-
-  int64 handle() const { return handle_; }
-
-  friend class XlaBuilder;
-
-  // < 0 means "invalid handle".
-  int64 handle_;
-
-  // Not owned. Non-null for any handle returned by XlaBuilder, even if the
-  // handle is invalid.
-  XlaBuilder* builder_;
-};
-
-// Arithmetic operator overloads for the XlaOp type.
-XlaOp operator-(const XlaOp& x);
-XlaOp operator+(const XlaOp& x, const XlaOp& y);
-XlaOp operator-(const XlaOp& x, const XlaOp& y);
-XlaOp operator*(const XlaOp& x, const XlaOp& y);
-XlaOp operator/(const XlaOp& x, const XlaOp& y);
-XlaOp operator%(const XlaOp& x, const XlaOp& y);
-
-// Bitwise operator overloads for the XlaOp type.
-XlaOp operator~(const XlaOp& x);
-XlaOp operator&(const XlaOp& x, const XlaOp& y);
-XlaOp operator|(const XlaOp& x, const XlaOp& y);
-XlaOp operator^(const XlaOp& x, const XlaOp& y);
-XlaOp operator<<(const XlaOp& x, const XlaOp& y);
-// Performs a right arithmetic shift if 'x' is a signed type, otherwise performs
-// a right logical shift.
-XlaOp operator>>(const XlaOp& x, const XlaOp& y);
-
-// We don't overload the relational operators (==, !=, <, <=, >, >=) because the
-// semantics might be surprising since their result types are usually 'bool'.
-// Further programmers may expect == to be a structural equality.
-// We also choose not to overload any of the mutating operators (e.g., +=, -=)
-// because the semantics might be misleading — XLA computations are immutable.
-
-// A convenient interface for building up computations.
-//
-// Thread-compatible.
-class XlaBuilder {
- public:
-  // computation_name: name to use for the built computation.
-  XlaBuilder(const string& computation_name);
-
-  XlaBuilder(const XlaBuilder&) = delete;
-  XlaBuilder& operator=(const XlaBuilder&) = delete;
-
-  ~XlaBuilder();
-
-  // Returns the computation name.
-  const string& name() const { return name_; }
-
-  // Sets OpMetadata that will be added to all instructions until cleared.
-  //
-  // OpMetadata is often applied to a series of XLA HLO instructions. As a
-  // result, OpMetadata is set on the Computation Builder. All subsequent
-  // instructions generated via this Computation Builder will have the same
-  // OpMetadata attached until a call to ClearOpMetadata.
-  void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; }
-
-  // Clears the HloMetadata state.
-  void ClearOpMetadata() { metadata_.Clear(); }
-
-  // Sets an OpSharding that will be attached to all instructions until cleared.
-  void SetSharding(const OpSharding& sharding) { sharding_ = sharding; }
-
-  // Clears the sharding. Ops will be sharded according to the default placement
-  // policy.
-  void ClearSharding() { sharding_ = tensorflow::gtl::nullopt; }
-
-  // Returns the OpSharding that will be attached to all instructions.
-  const tensorflow::gtl::optional<OpSharding>& sharding() const {
-    return sharding_;
-  }
-
-  // Sets the builder to a mode where it will die immediately when an error is
-  // encountered, rather than producing it in a deferred fashion when Build() is
-  // called (which is the default).
-  void set_die_immediately_on_error(bool enabled) {
-    die_immediately_on_error_ = enabled;
-  }
-
-  // Default dimension numbers used for a 2D convolution.
-  static constexpr int64 kConvBatchDimension = 0;
-  static constexpr int64 kConvFeatureDimension = 1;
-  static constexpr int64 kConvFirstSpatialDimension = 2;
-  static constexpr int64 kConvSecondSpatialDimension = 3;
-  static constexpr int64 kConvKernelOutputDimension = 0;
-  static constexpr int64 kConvKernelInputDimension = 1;
-  static constexpr int64 kConvKernelFirstSpatialDimension = 2;
-  static constexpr int64 kConvKernelSecondSpatialDimension = 3;
-
-  // Creates a default ConvolutionDimensionNumbers. For a 2D convolution, for
-  // the input operand {batch, feature, height, width} = {0, 1, 2, 3} and for
-  // the kernel operand
-  // {output_feature, input_feature, height, width} = {0, 1, 2, 3}.
-  static ConvolutionDimensionNumbers CreateDefaultConvDimensionNumbers(
-      int num_spatial_dims = 2);
-
-  // Returns an error if the convolution dimension numbers have conflicts.
-  static Status Validate(const ConvolutionDimensionNumbers& dnum);
-
-  // Returns a new XlaBuilder whose resultant Computation is used only by this
-  // XlaBuilder. The sub-XlaBuilder has the same die_immediately_on_error
-  // behavior as the parent.
-  std::unique_ptr<XlaBuilder> CreateSubBuilder(const string& computation_name);
-
-  // Builds the computation with the requested operations, or returns a non-ok
-  // status. Note that all ops that have been enqueued will be moved to the
-  // computation being returned.
-  StatusOr<XlaComputation> Build();
-
-  // Builds the computation with the requested operations, or notes an error in
-  // the parent XlaBuilder and returns an empty computation if building failed.
-  // This function is intended to be used where the returned XlaComputation is
-  // only used by the parent XlaBuilder and hence further operation on the
-  // returned XlaComputation will simply be error'ed out if an error occurred
-  // while building this computation. If the built computation is to be used by
-  // a XlaBuilder other than the parent XlaBuilder then Build() should be used
-  // instead.
-  XlaComputation BuildAndNoteError();
-
-  // Returns a subgraph that roots on the given root. If the root is not a
-  // compile-time constant (see `IsConstant`), returns an error.
-  //
-  // This will copy the needed ops/computations to the subgraph.
-  StatusOr<XlaComputation> BuildConstantSubGraph(const XlaOp& root_op) const;
-
-  // Returns the first error that was encountered while building the
-  // computation. When an error is encountered, by default we return a vacuous
-  // XlaOp and inform the user of the error that occurred while
-  // building the computation when they make a final call to Build().
-  //
-  // See also set_die_immediately_on_error().
-  Status first_error() const { return first_error_; }
-
-  // Returns the shape of the given op.
-  StatusOr<Shape> GetShape(const XlaOp& op) const;
-
-  // Returns the (inferred) result for the current computation's shape.
-  StatusOr<ProgramShape> GetProgramShape() const;
-
-  // Reports an error to the builder, by
-  // * storing it internally and capturing a backtrace if it's the first error
-  //   (this deferred value will be produced on the call to
-  //    Build()/GetShape()/...)
-  // * dying if die_immediately_on_error_ is true.
-  // Returns an XlaOp with an invalid handle but a valid builder. This value can
-  // be returned in place of a value in APIs that return an XlaOp.
-  XlaOp ReportError(const Status& error);
-
-  // A helper function that converts a StatusOr<XlaOp> into an XlaOp.
-  // If the Status was an error, reports the error to builder and returns an
-  // invalid XlaOp handle.
-  XlaOp ReportErrorOrReturn(const StatusOr<XlaOp>& op);
-
-  // A helper function that runs a function that returns a StatusOr<XlaOp> and
-  // returns an XlaOp.
-  XlaOp ReportErrorOrReturn(const std::function<StatusOr<XlaOp>()>& op_creator);
-
-  // Returns true if 'operand' is a compile-time constant. A compile-time
-  // constant does not depend on any parameters, or on stateful operators such
-  // as `RngNormal` or `Infeed`.
-  //
-  // This tests whether a computation is a compile-time constant without
-  // evaluating the computation.
-  StatusOr<bool> IsConstant(const XlaOp& operand) const;
-
- private:
-  // Enqueues a "retrieve parameter value" instruction for a parameter that was
-  // passed to the computation.
-  XlaOp Parameter(int64 parameter_number, const Shape& shape,
-                  const string& name);
-
-  // Enqueues a constant with the value of the given literal onto the
-  // computation.
-  XlaOp ConstantLiteral(const LiteralSlice& literal);
-
-  // Enqueues a constant onto the computation. Methods are templated on the
-  // native host type (NativeT) which corresponds to a specific XLA
-  // PrimitiveType as given in the following table:
-  //
-  //  Native Type   PrimitiveType
-  // -----------------------------
-  //   bool           PRED
-  //   int32          S32
-  //   int64          S64
-  //   uint32         U32
-  //   uint64         U64
-  //   float          F32
-  //   double         F64
-  //
-  // Note: not all primitive types defined in xla_data.proto have a
-  // corresponding native type yet.
-  template <typename NativeT>
-  XlaOp ConstantR0(NativeT value);
-  template <typename NativeT>
-  XlaOp ConstantR1(tensorflow::gtl::ArraySlice<NativeT> values);
-  XlaOp ConstantR1(const tensorflow::core::Bitmap& values);
-  template <typename NativeT>
-  XlaOp ConstantR2(
-      std::initializer_list<std::initializer_list<NativeT>> values);
-  template <typename NativeT>
-  XlaOp ConstantFromArrayWithLayout(const Array<NativeT>& values,
-                                    const Layout& layout);
-  template <typename NativeT>
-  XlaOp ConstantFromArray(const Array<NativeT>& values);
-  template <typename NativeT>
-  XlaOp ConstantR2FromArray2DWithLayout(const Array2D<NativeT>& values,
-                                        const Layout& layout);
-  template <typename NativeT>
-  XlaOp ConstantR2FromArray2D(const Array2D<NativeT>& values);
-  template <typename NativeT>
-  XlaOp ConstantR3FromArray3DWithLayout(const Array3D<NativeT>& values,
-                                        const Layout& layout);
-  template <typename NativeT>
-  XlaOp ConstantR3FromArray3D(const Array3D<NativeT>& values);
-  template <typename NativeT>
-  XlaOp ConstantR4FromArray4DWithLayout(const Array4D<NativeT>& values,
-                                        const Layout& layout);
-  template <typename NativeT>
-  XlaOp ConstantR4FromArray4D(const Array4D<NativeT>& values);
-
-  // Enqueues a rank one constant (vector) onto the computation. The vector has
-  // size 'length' and every element has the value 'value'.
-  template <typename NativeT>
-  XlaOp ConstantR1(int64 length, NativeT value);
-
-  // Adds dimensions to an array by duplicating the data in the array.
-  //
-  // The new dimensions are inserted on the left, i.e. if
-  // broadcast_sizes has values {a0, ..., aN} and the operand shape
-  // has dimensions {b0, ..., bM} then the shape of the output has
-  // dimensions {a0, ..., aN, b0, ..., bM}.
-  //
-  // The new dimensions index into copies of the operand, i.e.
-  //
-  //   output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
-  XlaOp Broadcast(const XlaOp& operand,
-                  tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
-
-  // Performs in-dimension-style broadcast.
-  //
-  // Operand specifies the input to be broadcast. "shape" is expected output
-  // shape. "broadcast_dimensions" are the dimensions to be broadcasting into.
-  // Dimension numbers in broadcast_dimensions map to individual dimensions
-  // of the operand, and specify what dimension of the output shape they
-  // should be broadcast.
-  // e.g.
-  // Say operand = [1, 2], i.e., a 1D tensor with 2 elements.
-  // and dimension of shape is [2,2].
-  // Specifying {1} as brodcast_dimension will generate output
-  // [1 , 2]
-  // [1 , 2]
-  // On the other hand, specifying {0} as broadcast_dimension
-  // will generate output
-  // [1 , 1]
-  // [2 , 2]
-  XlaOp BroadcastInDim(
-      const XlaOp& operand, const Shape& shape,
-      const tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-
-  // Enqueues a pad operation onto the computation that pads the given value on
-  // the edges as well as between the elements of the input. padding_config
-  // specifies the padding amount for each dimension.
-  XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
-            const PaddingConfig& padding_config);
-
-  // Enqueues an operation onto the computation that flattens the operand based
-  // on the dimension order (major/slowest-varying to minor/fastest-varying)
-  // given, followed by reshaping it into the shape with the given dimension
-  // sizes (also major to minor). Conceptually, this is a limited form of
-  // "shape casting".
-  XlaOp Reshape(const XlaOp& operand,
-                tensorflow::gtl::ArraySlice<int64> dimensions,
-                tensorflow::gtl::ArraySlice<int64> new_sizes);
-
-  // Enqueues an operation onto the computation that collapses the operand, from
-  // first to last dimension (C order), then reshapes it to the given dimension
-  // sizes. Conceptually, this is a limited form of "shape casting".
-  XlaOp Reshape(const XlaOp& operand,
-                tensorflow::gtl::ArraySlice<int64> new_sizes);
-
-  // Wrapper for Reshape.
-  // Enqueues an operation to collapse the provided dimensions; e.g. an
-  // operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to
-  // {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must
-  // be a consecutive, in-order subsequence of the operand dimensions.
-  //
-  // Note that collapsing a single dimension does nothing:
-  //
-  //    {256} collapsing {0} => {256}
-  //    {1} collapsing {0} => {1}
-  //
-  // Collapsing multiple dimensions produces a single result dimension:
-  //
-  //    {256, 2} collapsing {0,1} => {512}
-  //    {256, 2, 3} collapsing {0,1} => {512, 3}
-  //
-  // This could potentially cause data to be moved -- it provides a more
-  // structured form of reshaping than an arbitrary Reshape operation.
-  XlaOp Collapse(const XlaOp& operand,
-                 tensorflow::gtl::ArraySlice<int64> dimensions);
-
-  // Enqueues a slice operation onto the computation that slices the operand
-  // from the start indices to the limit indices; e.g.
-  //
-  //        x
-  //   [ 0 1 2 3 ]
-  // y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ]
-  //   [ 8 9 a b ]
-  //
-  // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
-  // range notation.
-  // The strides parameter determines the stride over the slice
-  XlaOp Slice(const XlaOp& operand,
-              tensorflow::gtl::ArraySlice<int64> start_indices,
-              tensorflow::gtl::ArraySlice<int64> limit_indices,
-              tensorflow::gtl::ArraySlice<int64> strides);
-
-  // Enqueues a slice operation in a given dimension, taking all other
-  // dimensions as they are; e.g. if dimno is 1 from start_index 2 to
-  // limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand
-  // for:
-  //
-  //  array[:, 2:4:1, :]
-  XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
-                   int64 stride, int64 dimno);
-
-  // Enqueues a slice operation onto the computation that slices the 'operand'
-  // from dynamic start indices which are passed in 'start_indices'.
-  // The size of the slice in each dimension is passed in 'slice_sizes',
-  // which specify the end point of exclusive slice intervals in each
-  // dimension [start, start + size).
-  // The shape of 'start_indices' must be rank == 1, with dimension size
-  // equal to the rank of the 'operand'.
-  // Slice index calculations are computed modulo input dimension sizes to
-  // prevent dynamic start indices from generating out-of-bound array accesses.
-  XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
-                     tensorflow::gtl::ArraySlice<int64> slice_sizes);
-
-  // Enqueues a dynamic update slice operation onto the computation, which
-  // updates a slice of 'operand' with 'update' at dynamic 'start_indices'.
-  // The shape of 'update' determines the shape of the slice of 'operand'
-  // which is updated.
-  // The indices specified in 'start_indices' specify the offset of the slice
-  // of 'operand' which is updated.
-  //
-  //               update = {10, 11} // calculated at runtime.
-  //   [1 2 3]     start  = {1, 1}   // calculated at runtime.  [1 2  3 ]
-  //   [4 5 6]  => DynamicUpdateslice(data, update, start)   => [4 10 11]
-  //   [7 8 9]                                                  [7 8  9 ]
-  //
-  // The shape of 'start_indices' must be rank == 1, with dimension size
-  // equal to the rank of the 'operand'.
-  // Slice index calculations are computed modulo update dimension sizes to
-  // prevent dynamic start indices from generating out-of-bound array accesses.
-  XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
-                           const XlaOp& start_indices);
-
-  // Enqueues a concatenate instruction onto the computation. 'operands' must
-  // have >= 1 entry.
-  XlaOp ConcatInDim(tensorflow::gtl::ArraySlice<XlaOp> operands,
-                    int64 dimension);
-
-  // Enqueue a tracing operation onto the computation; the computation will emit
-  // a logging message with the operand.
-  void Trace(const string& tag, const XlaOp& operand);
-
-  // Enqueues a conditional-move-like select operation onto the computation;
-  // predicated on pred, selects between on_true and on_false.
-  XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false);
-
-  // Enqueues a tuple-creation instruction onto the computation.
-  XlaOp Tuple(tensorflow::gtl::ArraySlice<XlaOp> elements);
-
-  // Enqueues a tuple-element-get instruction onto the computation.
-  XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
-
-  // Enqueues an equal-to comparison instruction onto the computation.
-  XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
-           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a not-equal comparison instruction onto the computation.
-  XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
-           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a greater-or-equal comparison instruction onto the computation.
-  XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
-           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a greater-than comparison instruction onto the computation.
-  XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
-           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a less-than comparison instruction onto the computation.
-  XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
-           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a less-or-equal comparison instruction onto the computation.
-  XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
-           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a dot instruction onto the computation.
-  XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
-
-  // Enqueues a general dot instruction onto the computation.
-  XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
-                   const DotDimensionNumbers& dimension_numbers);
-
-  // Enqueues a convolution instruction onto the computation, which uses the
-  // default convolution dimension numbers.
-  XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
-             tensorflow::gtl::ArraySlice<int64> window_strides,
-             Padding padding);
-
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided padding configuration in the format returned by MakePadding().
-  XlaOp ConvWithGeneralPadding(
-      const XlaOp& lhs, const XlaOp& rhs,
-      tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
-
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided dimension numbers configuration.
-  XlaOp ConvWithGeneralDimensions(
-      const XlaOp& lhs, const XlaOp& rhs,
-      tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-      const ConvolutionDimensionNumbers& dimension_numbers);
-
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided padding configuration as well as the dimension numbers.
-  XlaOp ConvGeneral(
-      const XlaOp& lhs, const XlaOp& rhs,
-      tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-      const ConvolutionDimensionNumbers& dimension_numbers);
-
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided padding configuration, dilation factors and dimension numbers.
-  XlaOp ConvGeneralDilated(
-      const XlaOp& lhs, const XlaOp& rhs,
-      tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-      tensorflow::gtl::ArraySlice<int64> lhs_dilation,
-      tensorflow::gtl::ArraySlice<int64> rhs_dilation,
-      const ConvolutionDimensionNumbers& dimension_numbers);
-
-  // Enqueues an FFT instruction onto the computation, of the given type and
-  // with the given FFT length.
-  XlaOp Fft(const XlaOp& operand, FftType fft_type,
-            tensorflow::gtl::ArraySlice<int64> fft_length);
-
-  // Enqueues an infeed instruction onto the computation, which writes data of
-  // the given shape to the infeed buffer of the device.
-  XlaOp Infeed(const Shape& shape, const string& config = "");
-  XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
-                        const string& config = "");
-
-  // Enqueues an outfeed instruction onto the computation. This instruction
-  // generates outgoing data transfers for the given data.
-  //
-  // shape_with_layout communicates the laid out shape that we want to outfeed
-  // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error
-  // will occur.
-  void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
-               const string& outfeed_config);
-  XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
-                         const Shape& shape_with_layout,
-                         const string& outfeed_config);
-
-  // Enqueues a call instruction onto the computation.
-  XlaOp Call(const XlaComputation& computation,
-             tensorflow::gtl::ArraySlice<XlaOp> operands);
-
-  // Enqueues a custom call instruction onto the computation.
-  // During code generation, a call instruction is emitted which targets a
-  // symbol with the name |call_target_name|.  The |operands| are passed to the
-  // call instruction.  |shape| is the resultant shape.
-  XlaOp CustomCall(const string& call_target_name,
-                   tensorflow::gtl::ArraySlice<XlaOp> operands,
-                   const Shape& shape);
-
-  // Enqueues a pseudo-op to represent host-side computation data-dependencies.
-  // During code generation, host send and receive operations will be generated
-  // to transfer |operands| to the host and a single result of |shape| back to
-  // the device.  Host send/recv operations are emitted using |channel_name|.
-  // Dataflow dependencies and the |cost_estimate_ns| field may be used in HLO
-  // instruction scheduling.
-  XlaOp HostCompute(tensorflow::gtl::ArraySlice<XlaOp> operands,
-                    const string& channel_name, int64 cost_estimate_ns,
-                    const Shape& shape);
-
-  // The following methods enqueue element-wise binary arithmetic operations
-  // onto the computation. The shapes of the operands have to match unless one
-  // of the operands is a scalar, or an explicit broadcast dimension is given
-  // (see g3doc for more details).
-
-  // Enqueues a complex compose instruction onto the computation.
-  XlaOp Complex(const XlaOp& real, const XlaOp& imag,
-                tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a complex conjugate instruction onto the computation.
-  XlaOp Conj(const XlaOp& operand);
-
-  // Enqueues an add instruction onto the computation.
-  XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
-            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a subtract instruction onto the computation.
-  XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
-            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a multiply instruction onto the computation.
-  XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
-            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a divide instruction onto the computation.
-  XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
-            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a remainder instruction onto the computation.
-  XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
-            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a max instruction onto the computation.
-  XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
-            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues a min instruction onto the computation.
-  XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
-            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Element-wise logical operators
-  XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
-            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
-           tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
-            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  XlaOp Not(const XlaOp& operand);
-
-  XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
-                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-  XlaOp ShiftRightArithmetic(
-      const XlaOp& lhs, const XlaOp& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-  XlaOp ShiftRightLogical(
-      const XlaOp& lhs, const XlaOp& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Reduces an array among the provided dimensions, given "computation" as a
-  // reduction operator.
-  XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
-               const XlaComputation& computation,
-               tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
-
-  // Convenience wrapper around the above that reduces all the dimensions in the
-  // operand shape.
-  XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
-                  const XlaComputation& computation);
-
-  // Enqueues a windowed reduce instruction onto the computation.
-  XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
-                     const XlaComputation& computation,
-                     tensorflow::gtl::ArraySlice<int64> window_dimensions,
-                     tensorflow::gtl::ArraySlice<int64> window_strides,
-                     Padding padding);
-
-  // As ReduceWindow(), but the padding is given in the format
-  // returned by MakePadding().
-  XlaOp ReduceWindowWithGeneralPadding(
-      const XlaOp& operand, const XlaOp& init_value,
-      const XlaComputation& computation,
-      tensorflow::gtl::ArraySlice<int64> window_dimensions,
-      tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
-
-  // Returns the sum of the operand value within each subgroup of replicas. All
-  // replicas supply one input to the sum and all replicas receive the resulting
-  // sum for each subgroup.
-  XlaOp CrossReplicaSum(
-      const XlaOp& operand,
-      tensorflow::gtl::ArraySlice<int64> replica_group_ids = {});
-
-  // Enqueues an operation that do an AllReduce of the operand cross cores. Here
-  // AllReduce means doing a reduction on the input operand cross cores and then
-  // broadcasting the reduction result to those cores. The reduction function is
-  // defined by `computation`, which should be a commutative computation on
-  // scalars, e.g., add, min, or max. The way that AllReduce is applied is
-  // configured by:
-  //
-  // - `replica_group_ids`: maps replica ids to subgroup ids. If empty, all
-  // replicas belong to one group. Allreduce will be applied within subgroups.
-  // For example, we have 4 replicas, then replica_group_ids={0,1,0,1} means,
-  // replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1.
-  //
-  // - `channel_id`: for Allreduce nodes from different models, if they have the
-  // same channel_id, they will be 'Allreduce'd. If empty, Allreduce will not be
-  // applied cross models.
-  //
-  // TODO(b/79737069): Rename this to AllReduce when it's ready to use.
-  XlaOp CrossReplicaSum(
-      const XlaOp& operand, const XlaComputation& computation,
-      tensorflow::gtl::ArraySlice<int64> replica_group_ids = {},
-      const tensorflow::gtl::optional<ChannelHandle>& channel_id =
-          tensorflow::gtl::nullopt);
-
-  // Enqueues an operation that scatters the `source` array to the selected
-  // indices of each window.
-  XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
-                         tensorflow::gtl::ArraySlice<int64> window_dimensions,
-                         tensorflow::gtl::ArraySlice<int64> window_strides,
-                         Padding padding, const XlaOp& source,
-                         const XlaOp& init_value,
-                         const XlaComputation& scatter);
-
-  // As SelectAndScatter(), but the padding is given in the format
-  // returned by MakePadding().
-  XlaOp SelectAndScatterWithGeneralPadding(
-      const XlaOp& operand, const XlaComputation& select,
-      tensorflow::gtl::ArraySlice<int64> window_dimensions,
-      tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-      const XlaOp& source, const XlaOp& init_value,
-      const XlaComputation& scatter);
-
-  // Enqueues an abs instruction onto the computation.
-  XlaOp Abs(const XlaOp& operand);
-
-  // Enqueues a atan2 instruction onto the computation.
-  XlaOp Atan2(const XlaOp& y, const XlaOp& x,
-              tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues an exp instruction onto the computation.
-  XlaOp Exp(const XlaOp& operand);
-
-  // Enqueues an expm1 instruction onto the computation.
-  XlaOp Expm1(const XlaOp& operand);
-
-  // Enqueues a floor instruction onto the computation.
-  XlaOp Floor(const XlaOp& operand);
-
-  // Enqueues a ceil instruction onto the computation.
-  XlaOp Ceil(const XlaOp& operand);
-
-  // Enqueues a round instruction onto the computation, rounding to nearest even
-  // with half-way cases rounding away from zero.
-  XlaOp Round(const XlaOp& operand);
-
-  // Enqueues an log instruction (natural logarithm) onto the computation.
-  XlaOp Log(const XlaOp& operand);
-
-  // Enqueues an log1p instruction (log(x+1)) onto the computation.
-  XlaOp Log1p(const XlaOp& operand);
-
-  // Enqueues a sign instruction onto the computation.
-  XlaOp Sign(const XlaOp& operand);
-
-  // Enqueues a count leading zeros instruction onto the computation.
-  XlaOp Clz(const XlaOp& operand);
-
-  // Enqueues a cosine instruction onto the computation.
-  XlaOp Cos(const XlaOp& operand);
-
-  // Enqueues a sine instruction onto the computation.
-  XlaOp Sin(const XlaOp& operand);
-
-  // Enqueues a tanh instruction onto the computation.
-  XlaOp Tanh(const XlaOp& operand);
-
-  // Enqueues a real-part instruction onto the computation.
-  XlaOp Real(const XlaOp& operand);
-
-  // Enqueues an imaginary-part instruction onto the computation.
-  XlaOp Imag(const XlaOp& operand);
-
-  // Enqueues a lhs^rhs computation onto the computation.
-  XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
-            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues an operator that tests if the operand's values are finite, i.e.,
-  // not Inf or NaN. Defined only for floating-point types. Returns an array of
-  // booleans with the same shape where entries are true iff the corresponding
-  // entry was NaN.
-  XlaOp IsFinite(const XlaOp& operand);
-
-  // Enqueues a convert instruction onto the computation that changes the
-  // element type of the operand array to primitive_type.
-  XlaOp ConvertElementType(const XlaOp& operand,
-                           PrimitiveType new_element_type);
-
-  // Enqueues a no-op instruction onto the computation that changes
-  // the element type of the operand array to primitive_type. The
-  // bit-widths of the source and destination element types must be
-  // identical.
-  XlaOp BitcastConvertType(const XlaOp& operand,
-                           PrimitiveType new_element_type);
-
-  // Enqueues a negate instruction onto the computation.
-  XlaOp Neg(const XlaOp& operand);
-
-  // Enqueues a transpose instruction onto the computation.
-  XlaOp Transpose(const XlaOp& operand,
-                  tensorflow::gtl::ArraySlice<int64> permutation);
-
-  // Enqueues a reverse instruction onto the computation. The order of the
-  // elements in the given dimensions is reversed (i.e., the element at index i
-  // is moved to index dimension_size - 1 - i).
-  XlaOp Rev(const XlaOp& operand,
-            tensorflow::gtl::ArraySlice<int64> dimensions);
-
-  // Enqueues a sort (as increasing order) instruction onto the computation.
-  // If only keys are provided:
-  // * If the keys are an rank-1 tensor (an array), the result is a sorted array
-  // of keys, in ascending order.
-  // * If the keys have higher rank, the keys are sorted along the provided
-  // dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension
-  // value of 0 will indepenently sort every column, and a dimension value of 1
-  // will independently sort each row. If no dimension number is provided, then
-  // the last dimension is chosen by default.
-  //
-  // If both keys and values are provided:
-  // * The keys and the values must tensors with the same dimensions. The
-  // element types of the tensors may be different.
-  // * The result is a tuple that consists of a sorted tensor of keys (along the
-  // provided dimension, as above) as the first element, and a tensor with their
-  // corresponding values as the second element.
-  XlaOp Sort(XlaOp keys,
-             tensorflow::gtl::optional<XlaOp> values = tensorflow::gtl::nullopt,
-             int64 dimension = -1);
-
-  // Enqueues a clamp instruction onto the computation.
-  XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
-
-  // Enqueues a map instruction onto the computation.
-  XlaOp Map(tensorflow::gtl::ArraySlice<XlaOp> operands,
-            const XlaComputation& computation,
-            tensorflow::gtl::ArraySlice<int64> dimensions,
-            tensorflow::gtl::ArraySlice<XlaOp> static_operands = {});
-
-  // Enqueues a N(mu, sigma) random number generation instruction onto the
-  // computation.
-  XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape);
-
-  // Enqueues a U(a, b) random number generation instruction onto the
-  // computation. Returns values in the semi-open interval [a, b).
-  XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
-
-  // Enqueues a while node onto the computation.
-  XlaOp While(const XlaComputation& condition, const XlaComputation& body,
-              const XlaOp& init);
-
-  // Enqueues a conditional node onto the computation.
-  XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
-                    const XlaComputation& true_computation,
-                    const XlaOp& false_operand,
-                    const XlaComputation& false_computation);
-
-  // Enqueues a ReducePrecision node onto the computation.
-  XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
-                        const int mantissa_bits);
-
-  // Enqueues a Gather node onto the computation.
-  XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
-               const GatherDimensionNumbers& dimension_numbers,
-               tensorflow::gtl::ArraySlice<int64> window_bounds);
-
-  // Enqueues a Send node onto the computation for device-to-device
-  // communication, to send the given operand to a Recv instruction that shares
-  // the same channel handle.
-  void Send(const XlaOp& operand, const ChannelHandle& handle);
-  XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
-                      const ChannelHandle& handle);
-
-  // Enqueues a Send node which sends data to the host.
-  XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
-                   const Shape& shape_with_layout, const ChannelHandle& handle);
-
-  // Enqueues a Recv node which receives data from the host.
-  XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
-                     const ChannelHandle& handle);
-
-  // Enqueues an AfterAll operation with no operands producing a token-shaped
-  // value.
-  XlaOp CreateToken();
-
-  // Enqueues an AfterAll operation with no operands producing a token-shaped
-  // value.
-  XlaOp AfterAll(tensorflow::gtl::ArraySlice<XlaOp> tokens);
-
-  // Enqueues a Recv node onto the computation. The data comes from a Send
-  // instruction that shares the same channel handle and its shape must
-  // be the same as the given shape.
-  XlaOp Recv(const Shape& shape, const ChannelHandle& handle);
-  XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
-                      const ChannelHandle& handle);
-
-  // Normalizes operand across spatial and batch dimensions for each feature.
-  //
-  // Returns a tuple (normalized, batch_mean, batch_var) where `normalized`
-  // is the normalized result and batch_mean and batch_var are the mean and
-  // variance, respectively, across batch for the operand.
-  XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
-                          const XlaOp& offset, float epsilon,
-                          int64 feature_index);
-
-  // Normalizes operand across spatial and batch dimensions for each feature.
-  //
-  // `BatchNormInference` is equivalent to calling `BatchNormTraining` without
-  // computing `mean` and `variance` for each batch inside the operation. It
-  // uses the input `mean` and `variance` instead as estimated values. The
-  // purpose of this op is to reduce latency in inference, hence the name
-  // `BatchNormInference`.
-  //
-  // The output has the same shape as `operand`, and contains the normalized
-  // values for each batch.
-  XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
-                           const XlaOp& offset, const XlaOp& mean,
-                           const XlaOp& variance, float epsilon,
-                           int64 feature_index);
-
-  // Calculates the gradients of a batch norm op.
-  //
-  // The inputs `batch_mean` and `batch_var` represent the mean and variance
-  // across the batch.
-  //
-  // Returns a tuple of three elements:
-  //   - grad_operand: Gradient with respect to input `operand`
-  //   - grad_offset: Gradient with respect to input `offset`
-  //   - grad_scale: Gradient with respect to input `scale`
-  XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
-                      const XlaOp& batch_mean, const XlaOp& batch_var,
-                      const XlaOp& grad_output, float epsilon,
-                      int64 feature_index);
-
-  StatusOr<XlaOp> AddInstruction(
-      HloInstructionProto&& instr, HloOpcode opcode,
-      tensorflow::gtl::ArraySlice<XlaOp> operands = {});
-
-  void AddCalledComputation(const XlaComputation& computation,
-                            HloInstructionProto* instr);
-
-  StatusOr<const HloInstructionProto*> LookUpInstruction(const XlaOp& op) const;
-
-  // Internal helper method that does the building for an arbitrary unary op.
-  XlaOp UnaryOp(HloOpcode unop, const XlaOp& operand);
-
-  // Internal helper method that does the building for an arbitrary binary op.
-  // broadcast_dimensions specifies which dimensions to use for broadcasting
-  // when the operation is between tensors of different ranks.
-  XlaOp BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
-                 tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-
-  // Internal helper method that does the building for an arbitrary ternary op.
-  XlaOp TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
-                  const XlaOp& ehs);
-
-  XlaOp RngOp(RandomDistribution distribution,
-              tensorflow::gtl::ArraySlice<XlaOp> parameters,
-              const Shape& shape);
-
-  StatusOr<XlaOp> InDimBroadcast(
-      const Shape& shape, const XlaOp& operand,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-
-  // Internal helper method that creates a sequence of instructions that
-  // performs an explicit broadcast of the operand to the target shape.
-  StatusOr<XlaOp> AddBroadcastSequence(const Shape& output_shape,
-                                       const XlaOp& operand);
-
-  // Internal helper method for creating a Reshape op with the already inferred
-  // shape.
-  StatusOr<XlaOp> Reshape(const Shape& shape, const XlaOp& operand);
-
-  // Returns the (inferred) result for the program shape for the current
-  // computation and fills the root_id in the pointer.
-  StatusOr<ProgramShape> GetProgramShape(int64* root_id) const;
-
-  // Returns shapes for the operands.
-  StatusOr<std::vector<Shape>> GetOperandShapes(
-      tensorflow::gtl::ArraySlice<XlaOp> operands) const;
-
-  // A visitor which checks whether an operation is a compile-time constant,
-  // meaning that it doesn't depend on any parameters, or on any stateful
-  // operation such as `RngNormal` or `Infeed`. The visitor walks the
-  // computation starting at a given operation and sets is_constant to false iff
-  // a parameter or stateful operation is encountered.
-  void IsConstantVisitor(const int64 op_handle, std::set<int64>* visited,
-                         bool* is_constant) const;
-
-  // Checks bounds for convolution parameters.
-  Status VerifyConvolution(
-      const Shape& lhs_shape, const Shape& rhs_shape,
-      const ConvolutionDimensionNumbers& dimension_numbers) const;
-
-  // Helper function for creating a Window proto from user-supplied data.
-  // Returns error if the user-supplied data was invalid.
-  StatusOr<Window> MakeWindow(
-      tensorflow::gtl::ArraySlice<int64> window_dimensions,
-      tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-      tensorflow::gtl::ArraySlice<int64> lhs_dilation,
-      tensorflow::gtl::ArraySlice<int64> rhs_dilation) const;
-
-  string name_;  // Name to use for the built computation.
-
-  // The first error encountered while building the computation.
-  // This is OK until the first error is encountered.
-  Status first_error_;
-
-  // The saved stack trace from the point at which the first error occurred.
-  tensorflow::SavedStackTrace first_error_backtrace_;
-
-  // The instructions of this computation.
-  std::vector<HloInstructionProto> instructions_;
-
-  // The embedded computations used by this computation. Each computation was
-  // the entry computation of some XlaComputation, the key is the unique id of
-  // that XlaComputation.
-  std::map<int64, HloComputationProto> embedded_;
-
-  // The unique parameter numbers.
-  tensorflow::gtl::FlatSet<int64> parameter_numbers_;
-
-  // The metadata to attach to each op. This is structured as a "modal"-like
-  // operation, in order to simplify client code (and not sprinkle this metadata
-  // throughout the TensorFlow op kernel implementations).
-  OpMetadata metadata_;
-
-  // Sharding for this operator. This is structured as a "model"-like operation,
-  // in order to simplify client code, similar to metadata_.
-  tensorflow::gtl::optional<OpSharding> sharding_;
-
-  // Mode bit that indicates whether to die when a first error is encountered.
-  bool die_immediately_on_error_ = false;
-
-  XlaBuilder* parent_builder_{nullptr};
-
-  friend XlaOp Parameter(XlaBuilder* builder, int64 parameter_number,
-                         const Shape& shape, const string& name);
-  friend XlaOp ConstantLiteral(XlaBuilder* builder,
-                               const LiteralSlice& literal);
-  template <typename NativeT>
-  friend XlaOp ConstantR0(XlaBuilder* builder, NativeT value);
-  template <typename NativeT>
-  friend XlaOp ConstantR1(XlaBuilder* builder,
-                          tensorflow::gtl::ArraySlice<NativeT> values);
-  friend XlaOp ConstantR1(XlaBuilder* builder,
-                          const tensorflow::core::Bitmap& values);
-  template <typename NativeT>
-  friend XlaOp ConstantR2(
-      XlaBuilder* builder,
-      std::initializer_list<std::initializer_list<NativeT>> values);
-  template <typename NativeT>
-  friend XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder,
-                                           const Array<NativeT>& values,
-                                           const Layout& layout);
-  template <typename NativeT>
-  friend XlaOp ConstantFromArray(XlaBuilder* builder,
-                                 const Array<NativeT>& values);
-  template <typename NativeT>
-  friend XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder,
-                                               const Array2D<NativeT>& values,
-                                               const Layout& layout);
-  template <typename NativeT>
-  friend XlaOp ConstantR2FromArray2D(XlaBuilder* builder,
-                                     const Array2D<NativeT>& values);
-  template <typename NativeT>
-  friend XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder,
-                                               const Array3D<NativeT>& values,
-                                               const Layout& layout);
-  template <typename NativeT>
-  friend XlaOp ConstantR3FromArray3D(XlaBuilder* builder,
-                                     const Array3D<NativeT>& values);
-  template <typename NativeT>
-  friend XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder,
-                                               const Array4D<NativeT>& values,
-                                               const Layout& layout);
-  template <typename NativeT>
-  friend XlaOp ConstantR4FromArray4D(XlaBuilder* builder,
-                                     const Array4D<NativeT>& values);
-
-  template <typename NativeT>
-  friend XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value);
-
-  friend XlaOp Broadcast(const XlaOp& operand,
-                         tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
-
-  friend XlaOp BroadcastInDim(
-      const XlaOp& operand, const Shape& shape,
-      const tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-
-  friend XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
-                   const PaddingConfig& padding_config);
-
-  friend XlaOp Reshape(const XlaOp& operand,
-                       tensorflow::gtl::ArraySlice<int64> dimensions,
-                       tensorflow::gtl::ArraySlice<int64> new_sizes);
-
-  friend XlaOp Reshape(const XlaOp& operand,
-                       tensorflow::gtl::ArraySlice<int64> new_sizes);
-
-  friend XlaOp Collapse(const XlaOp& operand,
-                        tensorflow::gtl::ArraySlice<int64> dimensions);
-
-  friend XlaOp Slice(const XlaOp& operand,
-                     tensorflow::gtl::ArraySlice<int64> start_indices,
-                     tensorflow::gtl::ArraySlice<int64> limit_indices,
-                     tensorflow::gtl::ArraySlice<int64> strides);
-
-  friend XlaOp SliceInDim(const XlaOp& operand, int64 start_index,
-                          int64 limit_index, int64 stride, int64 dimno);
-
-  friend XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
-                            tensorflow::gtl::ArraySlice<int64> slice_sizes);
-
-  friend XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
-                                  const XlaOp& start_indices);
-
-  friend XlaOp ConcatInDim(XlaBuilder* builder,
-                           tensorflow::gtl::ArraySlice<XlaOp> operands,
-                           int64 dimension);
-
-  friend void Trace(const string& tag, const XlaOp& operand);
-
-  friend XlaOp Select(const XlaOp& pred, const XlaOp& on_true,
-                      const XlaOp& on_false);
-  friend XlaOp Tuple(XlaBuilder* builder,
-                     tensorflow::gtl::ArraySlice<XlaOp> elements);
-  friend XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
-  friend XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
-                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
-                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
-                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
-                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
-                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
-                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
-  friend XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
-                          const DotDimensionNumbers& dimension_numbers);
-  friend XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
-                    tensorflow::gtl::ArraySlice<int64> window_strides,
-                    Padding padding);
-  friend XlaOp ConvWithGeneralPadding(
-      const XlaOp& lhs, const XlaOp& rhs,
-      tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
-  friend XlaOp ConvWithGeneralDimensions(
-      const XlaOp& lhs, const XlaOp& rhs,
-      tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-      const ConvolutionDimensionNumbers& dimension_numbers);
-  friend XlaOp ConvGeneral(
-      const XlaOp& lhs, const XlaOp& rhs,
-      tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-      const ConvolutionDimensionNumbers& dimension_numbers);
-  friend XlaOp ConvGeneralDilated(
-      const XlaOp& lhs, const XlaOp& rhs,
-      tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-      tensorflow::gtl::ArraySlice<int64> lhs_dilation,
-      tensorflow::gtl::ArraySlice<int64> rhs_dilation,
-      const ConvolutionDimensionNumbers& dimension_numbers);
-  friend XlaOp Fft(const XlaOp& operand, FftType fft_type,
-                   tensorflow::gtl::ArraySlice<int64> fft_length);
-  friend XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
-                      const string& config);
-  friend void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
-                      const string& outfeed_config);
-  friend XlaOp Call(XlaBuilder* builder, const XlaComputation& computation,
-                    tensorflow::gtl::ArraySlice<XlaOp> operands);
-  friend XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
-                          tensorflow::gtl::ArraySlice<XlaOp> operands,
-                          const Shape& shape);
-  friend XlaOp HostCompute(XlaBuilder* builder,
-                           tensorflow::gtl::ArraySlice<XlaOp> operands,
-                           const string& channel_name, int64 cost_estimate_ns,
-                           const Shape& shape);
-  friend XlaOp Complex(const XlaOp& real, const XlaOp& imag,
-                       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Conj(const XlaOp& operand);
-  friend XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
-                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
-                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
-                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
-                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
-                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
-                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
-                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
-                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
-                  tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
-                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Not(const XlaOp& operand);
-  friend XlaOp ShiftLeft(
-      const XlaOp& lhs, const XlaOp& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp ShiftRightArithmetic(
-      const XlaOp& lhs, const XlaOp& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp ShiftRightLogical(
-      const XlaOp& lhs, const XlaOp& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
-                      const XlaComputation& computation,
-                      tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
-  friend XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
-                         const XlaComputation& computation);
-  friend XlaOp ReduceWindow(
-      const XlaOp& operand, const XlaOp& init_value,
-      const XlaComputation& computation,
-      tensorflow::gtl::ArraySlice<int64> window_dimensions,
-      tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding);
-  friend XlaOp ReduceWindowWithGeneralPadding(
-      const XlaOp& operand, const XlaOp& init_value,
-      const XlaComputation& computation,
-      tensorflow::gtl::ArraySlice<int64> window_dimensions,
-      tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
-  friend XlaOp CrossReplicaSum(
-      const XlaOp& operand,
-      tensorflow::gtl::ArraySlice<int64> replica_group_ids);
-  friend XlaOp CrossReplicaSum(
-      const XlaOp& operand, const XlaComputation& computation,
-      tensorflow::gtl::ArraySlice<int64> replica_group_ids,
-      const tensorflow::gtl::optional<ChannelHandle>& channel_id);
-  friend XlaOp SelectAndScatter(
-      const XlaOp& operand, const XlaComputation& select,
-      tensorflow::gtl::ArraySlice<int64> window_dimensions,
-      tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-      const XlaOp& source, const XlaOp& init_value,
-      const XlaComputation& scatter);
-  friend XlaOp SelectAndScatterWithGeneralPadding(
-      const XlaOp& operand, const XlaComputation& select,
-      tensorflow::gtl::ArraySlice<int64> window_dimensions,
-      tensorflow::gtl::ArraySlice<int64> window_strides,
-      tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-      const XlaOp& source, const XlaOp& init_value,
-      const XlaComputation& scatter);
-  friend XlaOp Abs(const XlaOp& operand);
-  friend XlaOp Atan2(const XlaOp& y, const XlaOp& x,
-                     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp Exp(const XlaOp& operand);
-  friend XlaOp Expm1(const XlaOp& operand);
-  friend XlaOp Floor(const XlaOp& operand);
-  friend XlaOp Ceil(const XlaOp& operand);
-  friend XlaOp Round(const XlaOp& operand);
-  friend XlaOp Log(const XlaOp& operand);
-  friend XlaOp Log1p(const XlaOp& operand);
-  friend XlaOp Sign(const XlaOp& operand);
-  friend XlaOp Clz(const XlaOp& operand);
-  friend XlaOp Cos(const XlaOp& operand);
-  friend XlaOp Sin(const XlaOp& operand);
-  friend XlaOp Tanh(const XlaOp& operand);
-  friend XlaOp Real(const XlaOp& operand);
-  friend XlaOp Imag(const XlaOp& operand);
-  friend XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
-                   tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-  friend XlaOp IsFinite(const XlaOp& operand);
-  friend XlaOp ConvertElementType(const XlaOp& operand,
-                                  PrimitiveType new_element_type);
-  friend XlaOp BitcastConvertType(const XlaOp& operand,
-                                  PrimitiveType new_element_type);
-  friend XlaOp Neg(const XlaOp& operand);
-  friend XlaOp Transpose(const XlaOp& operand,
-                         tensorflow::gtl::ArraySlice<int64> permutation);
-  friend XlaOp Rev(const XlaOp& operand,
-                   tensorflow::gtl::ArraySlice<int64> dimensions);
-  friend XlaOp Sort(XlaOp keys, tensorflow::gtl::optional<XlaOp> values,
-                    int64 dimension);
-  friend XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
-  friend XlaOp Map(XlaBuilder* builder,
-                   tensorflow::gtl::ArraySlice<XlaOp> operands,
-                   const XlaComputation& computation,
-                   tensorflow::gtl::ArraySlice<int64> dimensions,
-                   tensorflow::gtl::ArraySlice<XlaOp> static_operands);
-  friend XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma,
-                         const Shape& shape);
-  friend XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
-  friend XlaOp While(const XlaComputation& condition,
-                     const XlaComputation& body, const XlaOp& init);
-  friend XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
-                           const XlaComputation& true_computation,
-                           const XlaOp& false_operand,
-                           const XlaComputation& false_computation);
-  friend XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
-                               const int mantissa_bits);
-  friend XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
-                      const GatherDimensionNumbers& dimension_numbers,
-                      tensorflow::gtl::ArraySlice<int64> window_bounds);
-  friend void Send(const XlaOp& operand, const ChannelHandle& handle);
-  friend XlaOp Recv(XlaBuilder* builder, const Shape& shape,
-                    const ChannelHandle& handle);
-  friend XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
-                                 const XlaOp& offset, float epsilon,
-                                 int64 feature_index);
-  friend XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
-                                  const XlaOp& offset, const XlaOp& mean,
-                                  const XlaOp& variance, float epsilon,
-                                  int64 feature_index);
-  friend XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
-                             const XlaOp& batch_mean, const XlaOp& batch_var,
-                             const XlaOp& grad_output, float epsilon,
-                             int64 feature_index);
-  friend XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
-                             const ChannelHandle& handle);
-  friend XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
-                             const ChannelHandle& handle);
-  friend XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
-                          const Shape& shape_with_layout,
-                          const ChannelHandle& handle);
-  friend XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
-                            const ChannelHandle& handle);
-  friend XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
-                               const string& config);
-  friend XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
-                                const Shape& shape_with_layout,
-                                const string& outfeed_config);
-  friend XlaOp CreateToken(XlaBuilder* builder);
-  friend XlaOp AfterAll(XlaBuilder* builder,
-                        tensorflow::gtl::ArraySlice<XlaOp> tokens);
-};
-
-// RAII-style object: sets the current sharding assignment in builder on
-// construction, and sets back to the previous assignment on destruction.
-class XlaScopedShardingAssignment {
- public:
-  XlaScopedShardingAssignment(xla::XlaBuilder* builder,
-                              tensorflow::gtl::optional<OpSharding> sharding)
-      : builder_(builder), prev_sharding_(builder->sharding()) {
-    SetSharding(sharding);
-  }
-
-  XlaScopedShardingAssignment(const XlaScopedShardingAssignment&) = delete;
-  XlaScopedShardingAssignment& operator=(const XlaScopedShardingAssignment&) =
-      delete;
-
-  ~XlaScopedShardingAssignment() { SetSharding(prev_sharding_); }
-
- private:
-  void SetSharding(const tensorflow::gtl::optional<OpSharding>& sharding) {
-    if (sharding.has_value()) {
-      builder_->SetSharding(sharding.value());
-    } else {
-      builder_->ClearSharding();
-    }
-  }
-
-  xla::XlaBuilder* const builder_;
-  tensorflow::gtl::optional<OpSharding> prev_sharding_;
-};
-
-// Free functions for building XlaOps. The intention is that these will
-// become the public API for building XlaOps rather than calling methods on
-// XlaBuilder directly.
-
-// Enqueues a "retrieve parameter value" instruction for a parameter that was
-// passed to the computation.
-XlaOp Parameter(XlaBuilder* builder, int64 parameter_number, const Shape& shape,
-                const string& name);
-
-// Enqueues a constant with the value of the given literal onto the
-// computation.
-XlaOp ConstantLiteral(XlaBuilder* builder, const LiteralSlice& literal);
-
-// Enqueues a constant onto the computation. Methods are templated on the
-// native host type (NativeT) which corresponds to a specific XLA
-// PrimitiveType as given in the following table:
-//
-//  Native Type   PrimitiveType
-// -----------------------------
-//   bool           PRED
-//   int32          S32
-//   int64          S64
-//   uint32         U32
-//   uint64         U64
-//   float          F32
-//   double         F64
-//
-// Note: not all primitive types defined in xla_data.proto have a
-// corresponding native type yet.
-template <typename NativeT>
-XlaOp ConstantR0(XlaBuilder* builder, NativeT value);
-template <typename NativeT>
-XlaOp ConstantR1(XlaBuilder* builder,
-                 tensorflow::gtl::ArraySlice<NativeT> values);
-XlaOp ConstantR1(XlaBuilder* builder, const tensorflow::core::Bitmap& values);
-template <typename NativeT>
-XlaOp ConstantR2(XlaBuilder* builder,
-                 std::initializer_list<std::initializer_list<NativeT>> values);
-template <typename NativeT>
-XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder,
-                                  const Array<NativeT>& values,
-                                  const Layout& layout);
-template <typename NativeT>
-XlaOp ConstantFromArray(XlaBuilder* builder, const Array<NativeT>& values);
-template <typename NativeT>
-XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder,
-                                      const Array2D<NativeT>& values,
-                                      const Layout& layout);
-template <typename NativeT>
-XlaOp ConstantR2FromArray2D(XlaBuilder* builder,
-                            const Array2D<NativeT>& values);
-template <typename NativeT>
-XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder,
-                                      const Array3D<NativeT>& values,
-                                      const Layout& layout);
-template <typename NativeT>
-XlaOp ConstantR3FromArray3D(XlaBuilder* builder,
-                            const Array3D<NativeT>& values);
-template <typename NativeT>
-XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder,
-                                      const Array4D<NativeT>& values,
-                                      const Layout& layout);
-template <typename NativeT>
-XlaOp ConstantR4FromArray4D(XlaBuilder* builder,
-                            const Array4D<NativeT>& values);
-
-// Enqueues a rank one constant (XlaBuilder* builder, vector) onto the
-// computation. The vector has size 'length' and every element has the value
-// 'value'.
-template <typename NativeT>
-XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value);
-
-// Adds dimensions to an array by duplicating the data in the array.
-//
-// The new dimensions are inserted on the left, i.e. if
-// broadcast_sizes has values {a0, ..., aN} and the operand shape
-// has dimensions {b0, ..., bM} then the shape of the output has
-// dimensions {a0, ..., aN, b0, ..., bM}.
-//
-// The new dimensions index into copies of the operand, i.e.
-//
-//   output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
-XlaOp Broadcast(const XlaOp& operand,
-                tensorflow::gtl::ArraySlice<int64> broadcast_sizes);
-
-// Performs in-dimension-style broadcast.
-//
-// Operand specifies the input to be broadcast. "shape" is expected output
-// shape. "broadcast_dimensions" are the dimensions to be broadcasting into.
-// Dimension numbers in broadcast_dimensions map to individual dimensions
-// of the operand, and specify what dimension of the output shape they
-// should be broadcast.
-// e.g.
-// Say operand = [1, 2], i.e., a 1D tensor with 2 elements.
-// and dimension of shape is [2,2].
-// Specifying {1} as brodcast_dimension will generate output
-// [1 , 2]
-// [1 , 2]
-// On the other hand, specifying {0} as broadcast_dimension
-// will generate output
-// [1 , 1]
-// [2 , 2]
-XlaOp BroadcastInDim(
-    const XlaOp& operand, const Shape& shape,
-    const tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
-
-// Enqueues a pad operation onto the computation that pads the given value on
-// the edges as well as between the elements of the input. padding_config
-// specifies the padding amount for each dimension.
-XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
-          const PaddingConfig& padding_config);
-
-// Enqueues an operation onto the computation that flattens the operand based
-// on the dimension order (major/slowest-varying to minor/fastest-varying)
-// given, followed by reshaping it into the shape with the given dimension
-// sizes (also major to minor). Conceptually, this is a limited form of
-// "shape casting".
-XlaOp Reshape(const XlaOp& operand,
-              tensorflow::gtl::ArraySlice<int64> dimensions,
-              tensorflow::gtl::ArraySlice<int64> new_sizes);
-
-// Enqueues an operation onto the computation that collapses the operand, from
-// first to last dimension (C order), then reshapes it to the given dimension
-// sizes. Conceptually, this is a limited form of "shape casting".
-XlaOp Reshape(const XlaOp& operand,
-              tensorflow::gtl::ArraySlice<int64> new_sizes);
-
-// Wrapper for Reshape.
-// Enqueues an operation to collapse the provided dimensions; e.g. an
-// operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to
-// {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must
-// be a consecutive, in-order subsequence of the operand dimensions.
-//
-// Note that collapsing a single dimension does nothing:
-//
-//    {256} collapsing {0} => {256}
-//    {1} collapsing {0} => {1}
-//
-// Collapsing multiple dimensions produces a single result dimension:
-//
-//    {256, 2} collapsing {0,1} => {512}
-//    {256, 2, 3} collapsing {0,1} => {512, 3}
-//
-// This could potentially cause data to be moved -- it provides a more
-// structured form of reshaping than an arbitrary Reshape operation.
-XlaOp Collapse(const XlaOp& operand,
-               tensorflow::gtl::ArraySlice<int64> dimensions);
-
-// Enqueues a slice operation onto the computation that slices the operand
-// from the start indices to the limit indices; e.g.
-//
-//        x
-//   [ 0 1 2 3 ]
-// y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ]
-//   [ 8 9 a b ]
-//
-// Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
-// range notation.
-// The strides parameter determines the stride over the slice
-XlaOp Slice(const XlaOp& operand,
-            tensorflow::gtl::ArraySlice<int64> start_indices,
-            tensorflow::gtl::ArraySlice<int64> limit_indices,
-            tensorflow::gtl::ArraySlice<int64> strides);
-
-// Enqueues a slice operation in a given dimension, taking all other
-// dimensions as they are; e.g. if dimno is 1 from start_index 2 to
-// limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand
-// for:
-//
-//  array[:, 2:4:1, :]
-XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
-                 int64 stride, int64 dimno);
-
-// Enqueues a slice operation onto the computation that slices the 'operand'
-// from dynamic start indices which are passed in 'start_indices'.
-// The size of the slice in each dimension is passed in 'slice_sizes',
-// which specify the end point of exclusive slice intervals in each
-// dimension [start, start + size).
-// The shape of 'start_indices' must be rank == 1, with dimension size
-// equal to the rank of the 'operand'.
-// Slice index calculations are computed modulo input dimension sizes to
-// prevent dynamic start indices from generating out-of-bound array accesses.
-XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
-                   tensorflow::gtl::ArraySlice<int64> slice_sizes);
-
-// Enqueues a dynamic update slice operation onto the computation, which
-// updates a slice of 'operand' with 'update' at dynamic 'start_indices'.
-// The shape of 'update' determines the shape of the slice of 'operand'
-// which is updated.
-// The indices specified in 'start_indices' specify the offset of the slice
-// of 'operand' which is updated.
-//
-//               update = {10, 11} // calculated at runtime.
-//   [1 2 3]     start  = {1, 1}   // calculated at runtime.  [1 2  3 ]
-//   [4 5 6]  => DynamicUpdateslice(data, update, start)   => [4 10 11]
-//   [7 8 9]                                                  [7 8  9 ]
-//
-// The shape of 'start_indices' must be rank == 1, with dimension size
-// equal to the rank of the 'operand'.
-// Slice index calculations are computed modulo update dimension sizes to
-// prevent dynamic start indices from generating out-of-bound array accesses.
-XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
-                         const XlaOp& start_indices);
-
-// Enqueues a concatenate instruction onto the computation. 'operands' must
-// have >= 1 entry.
-XlaOp ConcatInDim(XlaBuilder* builder,
-                  tensorflow::gtl::ArraySlice<XlaOp> operands, int64 dimension);
-
-// Enqueue a tracing operation onto the computation; the computation will emit
-// a logging message with the operand.
-void Trace(const string& tag, const XlaOp& operand);
-
-// Enqueues a conditional-move-like select operation onto the computation;
-// predicated on pred, selects between on_true and on_false.
-XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false);
-
-// Enqueues a tuple-creation instruction onto the computation.
-XlaOp Tuple(XlaBuilder* builder, tensorflow::gtl::ArraySlice<XlaOp> elements);
-
-// Enqueues a tuple-element-get instruction onto the computation.
-XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
-
-// Enqueues an equal-to comparison instruction onto the computation.
-XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
-         tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a not-equal comparison instruction onto the computation.
-XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
-         tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a greater-or-equal comparison instruction onto the computation.
-XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
-         tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a greater-than comparison instruction onto the computation.
-XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
-         tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a less-than comparison instruction onto the computation.
-XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
-         tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a less-or-equal comparison instruction onto the computation.
-XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
-         tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a dot instruction onto the computation.
-XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs);
-
-// Enqueues a general dot instruction onto the computation.
-XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
-                 const DotDimensionNumbers& dimension_numbers);
-
-// Enqueues a convolution instruction onto the computation, which uses the
-// default convolution dimension numbers.
-XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
-           tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding);
-
-// Enqueues a convolution instruction onto the computation, with the caller
-// provided padding configuration in the format returned by MakePadding().
-XlaOp ConvWithGeneralPadding(
-    const XlaOp& lhs, const XlaOp& rhs,
-    tensorflow::gtl::ArraySlice<int64> window_strides,
-    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
-
-// Enqueues a convolution instruction onto the computation, with the caller
-// provided dimension numbers configuration.
-XlaOp ConvWithGeneralDimensions(
-    const XlaOp& lhs, const XlaOp& rhs,
-    tensorflow::gtl::ArraySlice<int64> window_strides, Padding padding,
-    const ConvolutionDimensionNumbers& dimension_numbers);
-
-// Enqueues a convolution instruction onto the computation, with the caller
-// provided padding configuration as well as the dimension numbers.
-XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
-                  tensorflow::gtl::ArraySlice<int64> window_strides,
-                  tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-                  const ConvolutionDimensionNumbers& dimension_numbers);
-
-// Enqueues a convolution instruction onto the computation, with the caller
-// provided padding configuration, dilation factors and dimension numbers.
-XlaOp ConvGeneralDilated(
-    const XlaOp& lhs, const XlaOp& rhs,
-    tensorflow::gtl::ArraySlice<int64> window_strides,
-    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-    tensorflow::gtl::ArraySlice<int64> lhs_dilation,
-    tensorflow::gtl::ArraySlice<int64> rhs_dilation,
-    const ConvolutionDimensionNumbers& dimension_numbers);
-
-// Enqueues an FFT instruction onto the computation, of the given type and
-// with the given FFT length.
-XlaOp Fft(const XlaOp& operand, FftType fft_type,
-          tensorflow::gtl::ArraySlice<int64> fft_length);
-
-// Enqueues an infeed instruction onto the computation, which writes data of
-// the given shape to the infeed buffer of the device.
-XlaOp Infeed(XlaBuilder* builder, const Shape& shape,
-             const string& config = "");
-
-// Variant of Infeed which takes a token-shaped operand and produces a
-// two-element tuple containing the data value and a token-shaped value.
-// Tokens are used for ordering side-effecting operations.
-// TODO(b/110532604): Replace all uses of the non-token form with this variant.
-XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
-                      const string& config = "");
-
-// Enqueues an outfeed instruction onto the computation. This instruction
-// generates outgoing data transfers for the given data.
-//
-// shape_with_layout communicates the laid out shape that we want to outfeed
-// -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error
-// will occur.
-void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
-             const string& outfeed_config);
-
-// Variant of Outfeed which takes a token-shaped operand and produces a
-// token-shaped value. Tokens are used for ordering side-effecting operations.
-// TODO(b/110532604): Replace all uses of the non-token form with this variant.
-XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
-                       const Shape& shape_with_layout,
-                       const string& outfeed_config);
-
-// Enqueues a call instruction onto the computation.
-XlaOp Call(XlaBuilder* builder, const XlaComputation& computation,
-           tensorflow::gtl::ArraySlice<XlaOp> operands);
-
-// Enqueues a custom call instruction onto the computation.
-// During code generation, a call instruction is emitted which targets a
-// symbol with the name |call_target_name|.  The |operands| are passed to the
-// call instruction.  |shape| is the resultant shape.
-XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name,
-                 tensorflow::gtl::ArraySlice<XlaOp> operands,
-                 const Shape& shape);
-
-// Enqueues a pseudo-op to represent host-side computation data-dependencies.
-// During code generation, host send and receive operations will be generated
-// to transfer |operands| to the host and a single result of |shape| back to
-// the device.  Host send/recv operations are emitted using |channel_name|.
-// Dataflow dependencies and the |cost_estimate_ns| field may be used in HLO
-// instruction scheduling.
-XlaOp HostCompute(XlaBuilder* builder,
-                  tensorflow::gtl::ArraySlice<XlaOp> operands,
-                  const string& channel_name, int64 cost_estimate_ns,
-                  const Shape& shape);
-
-// The following methods enqueue element-wise binary arithmetic operations
-// onto the computation. The shapes of the operands have to match unless one
-// of the operands is a scalar, or an explicit broadcast dimension is given
-// (see g3doc for more details).
-
-// Enqueues a complex compose instruction onto the computation.
-XlaOp Complex(const XlaOp& real, const XlaOp& imag,
-              tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a complex conjugate instruction onto the computation.
-XlaOp Conj(const XlaOp& operand);
-
-// Enqueues an add instruction onto the computation.
-XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
-          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a subtract instruction onto the computation.
-XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
-          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a multiply instruction onto the computation.
-XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
-          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a divide instruction onto the computation.
-XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
-          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a remainder instruction onto the computation.
-XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
-          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a max instruction onto the computation.
-XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
-          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues a min instruction onto the computation.
-XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
-          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Element-wise logical operators
-XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
-          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-XlaOp Or(const XlaOp& lhs, const XlaOp& rhs,
-         tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-XlaOp Xor(const XlaOp& lhs, const XlaOp& rhs,
-          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-XlaOp Not(const XlaOp& operand);
-
-XlaOp ShiftLeft(const XlaOp& lhs, const XlaOp& rhs,
-                tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-XlaOp ShiftRightArithmetic(
-    const XlaOp& lhs, const XlaOp& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-XlaOp ShiftRightLogical(
-    const XlaOp& lhs, const XlaOp& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Reduces an array among the provided dimensions, given "computation" as a
-// reduction operator.
-XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
-             const XlaComputation& computation,
-             tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce);
-
-// Convenience wrapper around the above that reduces all the dimensions in the
-// operand shape.
-XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
-                const XlaComputation& computation);
-
-// Enqueues a windowed reduce instruction onto the computation.
-XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
-                   const XlaComputation& computation,
-                   tensorflow::gtl::ArraySlice<int64> window_dimensions,
-                   tensorflow::gtl::ArraySlice<int64> window_strides,
-                   Padding padding);
-
-// As ReduceWindow(), but the padding is given in the format
-// returned by MakePadding().
-XlaOp ReduceWindowWithGeneralPadding(
-    const XlaOp& operand, const XlaOp& init_value,
-    const XlaComputation& computation,
-    tensorflow::gtl::ArraySlice<int64> window_dimensions,
-    tensorflow::gtl::ArraySlice<int64> window_strides,
-    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding);
-
-// Returns the sum of the operand value within each subgroup of replicas. All
-// replicas supply one input to the sum and all replicas receive the resulting
-// sum for each subgroup.
-XlaOp CrossReplicaSum(
-    const XlaOp& operand,
-    tensorflow::gtl::ArraySlice<int64> replica_group_ids = {});
-
-// Enqueues an operation that do an AllReduce of the operand cross cores. Here
-// AllReduce means doing a reduction on the input operand cross cores and then
-// broadcasting the reduction result to those cores. The reduction function is
-// defined by `computation`, which should be a commutative computation on
-// scalars, e.g., add, min, or max. The way that AllReduce is applied is
-// configured by:
-//
-// - `replica_group_ids`: maps replica ids to subgroup ids. If empty, all
-// replicas belong to one group. Allreduce will be applied within subgroups.
-// For example, we have 4 replicas, then replica_group_ids={0,1,0,1} means,
-// replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1.
-//
-// - `channel_id`: for Allreduce nodes from different models, if they have the
-// same channel_id, they will be 'Allreduce'd. If empty, Allreduce will not be
-// applied cross models.
-//
-// TODO(b/79737069): Rename this to AllReduce when it's ready to use.
-XlaOp CrossReplicaSum(const XlaOp& operand, const XlaComputation& computation,
-                      tensorflow::gtl::ArraySlice<int64> replica_group_ids = {},
-                      const tensorflow::gtl::optional<ChannelHandle>&
-                          channel_id = tensorflow::gtl::nullopt);
-
-// Enqueues an operation that scatters the `source` array to the selected
-// indices of each window.
-XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
-                       tensorflow::gtl::ArraySlice<int64> window_dimensions,
-                       tensorflow::gtl::ArraySlice<int64> window_strides,
-                       Padding padding, const XlaOp& source,
-                       const XlaOp& init_value, const XlaComputation& scatter);
-
-// As SelectAndScatter(), but the padding is given in the format
-// returned by MakePadding().
-XlaOp SelectAndScatterWithGeneralPadding(
-    const XlaOp& operand, const XlaComputation& select,
-    tensorflow::gtl::ArraySlice<int64> window_dimensions,
-    tensorflow::gtl::ArraySlice<int64> window_strides,
-    tensorflow::gtl::ArraySlice<std::pair<int64, int64>> padding,
-    const XlaOp& source, const XlaOp& init_value,
-    const XlaComputation& scatter);
-
-// Enqueues an abs instruction onto the computation.
-XlaOp Abs(const XlaOp& operand);
-
-// Enqueues a atan2 instruction onto the computation.
-XlaOp Atan2(const XlaOp& y, const XlaOp& x,
-            tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues an exp instruction onto the computation.
-XlaOp Exp(const XlaOp& operand);
-
-// Enqueues an expm1 instruction onto the computation.
-XlaOp Expm1(const XlaOp& operand);
-
-// Enqueues a floor instruction onto the computation.
-XlaOp Floor(const XlaOp& operand);
-
-// Enqueues a ceil instruction onto the computation.
-XlaOp Ceil(const XlaOp& operand);
-
-// Enqueues a round instruction onto the computation, rounding to nearest even
-// with half-way cases rounding away from zero.
-XlaOp Round(const XlaOp& operand);
-
-// Enqueues an log instruction (natural logarithm) onto the computation.
-XlaOp Log(const XlaOp& operand);
-
-// Enqueues an log1p instruction (log(x+1)) onto the computation.
-XlaOp Log1p(const XlaOp& operand);
-
-// Enqueues a sign instruction onto the computation.
-XlaOp Sign(const XlaOp& operand);
-
-// Enqueues a count leading zeros instruction onto the computation.
-XlaOp Clz(const XlaOp& operand);
-
-// Enqueues a cosine instruction onto the computation.
-XlaOp Cos(const XlaOp& operand);
-
-// Enqueues a sine instruction onto the computation.
-XlaOp Sin(const XlaOp& operand);
-
-// Enqueues a tanh instruction onto the computation.
-XlaOp Tanh(const XlaOp& operand);
-
-// Enqueues a real-part instruction onto the computation.
-XlaOp Real(const XlaOp& operand);
-
-// Enqueues an imaginary-part instruction onto the computation.
-XlaOp Imag(const XlaOp& operand);
-
-// Enqueues a lhs^rhs computation onto the computation.
-XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
-          tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-// Enqueues an operator that tests if the operand's values are finite, i.e.,
-// not Inf or NaN. Defined only for floating-point types. Returns an array of
-// booleans with the same shape where entries are true iff the corresponding
-// entry was NaN.
-XlaOp IsFinite(const XlaOp& operand);
-
-// Enqueues a convert instruction onto the computation that changes the
-// element type of the operand array to primitive_type.
-XlaOp ConvertElementType(const XlaOp& operand, PrimitiveType new_element_type);
-
-// Enqueues a no-op instruction onto the computation that changes
-// the element type of the operand array to primitive_type. The
-// bit-widths of the source and destination element types must be
-// identical.
-XlaOp BitcastConvertType(const XlaOp& operand, PrimitiveType new_element_type);
-
-// Enqueues a negate instruction onto the computation.
-XlaOp Neg(const XlaOp& operand);
-
-// Enqueues a transpose instruction onto the computation.
-XlaOp Transpose(const XlaOp& operand,
-                tensorflow::gtl::ArraySlice<int64> permutation);
-
-// Enqueues a reverse instruction onto the computation. The order of the
-// elements in the given dimensions is reversed (i.e., the element at index i
-// is moved to index dimension_size - 1 - i).
-XlaOp Rev(const XlaOp& operand, tensorflow::gtl::ArraySlice<int64> dimensions);
-
-// Enqueues a sort (as increasing order) instruction onto the computation.
-// If only keys are provided:
-// * If the keys are an rank-1 tensor (an array), the result is a sorted array
-// of keys, in ascending order.
-// * If the keys have higher rank, the keys are sorted along the provided
-// dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension
-// value of 0 will indepenently sort every column, and a dimension value of 1
-// will independently sort each row. If no dimension number is provided, then
-// the last dimension is chosen by default.
-//
-// If both keys and values are provided:
-// * The keys and the values must tensors with the same dimensions. The
-// element types of the tensors may be different.
-// * The result is a tuple that consists of a sorted tensor of keys (along the
-// provided dimension, as above) as the first element, and a tensor with their
-// corresponding values as the second element.
-XlaOp Sort(XlaOp keys,
-           tensorflow::gtl::optional<XlaOp> values = tensorflow::gtl::nullopt,
-           int64 dimension = -1);
-
-// Enqueues a clamp instruction onto the computation.
-XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
-
-// Enqueues a map instruction onto the computation.
-XlaOp Map(XlaBuilder* builder, tensorflow::gtl::ArraySlice<XlaOp> operands,
-          const XlaComputation& computation,
-          tensorflow::gtl::ArraySlice<int64> dimensions,
-          tensorflow::gtl::ArraySlice<XlaOp> static_operands = {});
-
-// Enqueues a N(mu, sigma) random number generation instruction onto the
-// computation.
-XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape);
-
-// Enqueues a U(a, b) random number generation instruction onto the
-// computation. Returns values in the semi-open interval [a, b).
-XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
-
-// Enqueues a while node onto the computation.
-XlaOp While(const XlaComputation& condition, const XlaComputation& body,
-            const XlaOp& init);
-
-// Enqueues a conditional node onto the computation.
-XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
-                  const XlaComputation& true_computation,
-                  const XlaOp& false_operand,
-                  const XlaComputation& false_computation);
-
-// Enqueues a ReducePrecision node onto the computation.
-XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
-                      const int mantissa_bits);
-
-// Enqueues a Gather node onto the computation.
-XlaOp Gather(const XlaOp& input, const XlaOp& gather_indices,
-             const GatherDimensionNumbers& dimension_numbers,
-             tensorflow::gtl::ArraySlice<int64> window_bounds);
-
-// Enqueues a Send node onto the computation for device-to-device
-// communication. This operation sends the given operand to
-// a Recv instruction in a different computation that shares the same channel
-// handle.
-void Send(const XlaOp& operand, const ChannelHandle& handle);
-
-// Variant of Send which takes a token-shaped operand and produces a
-// token-shaped value.  Tokens are used for ordering side-effecting operations.
-// TODO(b/110532604): Replace all uses of the non-token form with this variant.
-XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
-                    const ChannelHandle& handle);
-
-// Enqueues a Recv node onto the computation for device-to-device
-// communication. The data comes from a Send instruction in a different
-// computation that shares the same channel handle and its shape must be the
-// same as the given shape.
-XlaOp Recv(XlaBuilder* builder, const Shape& shape,
-           const ChannelHandle& handle);
-
-// Variant of Recv which takes a token-shaped operand and produces a two-element
-// tuple containing the data value and a token-shaped value. Tokens are used
-// for ordering side-effecting operations.
-// TODO(b/110532604): Replace all uses of the non-token form with this variant.
-XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
-                    const ChannelHandle& handle);
-
-// Enqueues a Send node which transfers data from the device to the host. The
-// 'shape_with_layout' argument defines the layout of the data transferred; its
-// shape must be compatible with the shape of the operand. The operand must be
-// array-shaped.
-// TODO(b/111544877): Support tuple shapes.
-XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
-                 const Shape& shape_with_layout, const ChannelHandle& handle);
-
-// Enqueues a Recv node which transfers data from the host to the device. The
-// given shape must contain a layout and must be an array.
-// TODO(b/111544877): Support tuple shapes.
-XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
-                   const ChannelHandle& handle);
-
-// Enqueues an operation (AfterAll) with no operands that produces a
-// token-shaped value.  Tokens are used for ordering side-effecting operations.
-// This is a separate method from AfterAll to facility the removal of
-// operand-less AfterAll instructions.
-// TODO(b/110532604): Remove this function when all tokens are derived from a
-// single token generated or passed into the entry computation.
-XlaOp CreateToken(XlaBuilder* builder);
-
-// Enqueues an AfterAll instruction which produces a token-shaped value and
-// takes a variadic number of token-shaped operands. The number of operands must
-// be greater than zero. Used for joining tokens.
-XlaOp AfterAll(XlaBuilder* builder, tensorflow::gtl::ArraySlice<XlaOp> tokens);
-
-// Normalizes operand across spatial and batch dimensions for each feature.
-//
-// Returns a tuple (normalized, batch_mean, batch_var) where `normalized`
-// is the normalized result and batch_mean and batch_var are the mean and
-// variance, respectively, across batch for the operand.
-XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
-                        const XlaOp& offset, float epsilon,
-                        int64 feature_index);
-
-// Normalizes operand across spatial and batch dimensions for each feature.
-//
-// `BatchNormInference` is equivalent to calling `BatchNormTraining` without
-// computing `mean` and `variance` for each batch inside the operation. It
-// uses the input `mean` and `variance` instead as estimated values. The
-// purpose of this op is to reduce latency in inference, hence the name
-// `BatchNormInference`.
-//
-// The output has the same shape as `operand`, and contains the normalized
-// values for each batch.
-XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
-                         const XlaOp& offset, const XlaOp& mean,
-                         const XlaOp& variance, float epsilon,
-                         int64 feature_index);
-
-// Calculates the gradients of a batch norm op.
-//
-// The inputs `batch_mean` and `batch_var` represent the mean and variance
-// across the batch.
-//
-// Returns a tuple of three elements:
-//   - grad_operand: Gradient with respect to input `operand`
-//   - grad_offset: Gradient with respect to input `offset`
-//   - grad_scale: Gradient with respect to input `scale`
-XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
-                    const XlaOp& batch_mean, const XlaOp& batch_var,
-                    const XlaOp& grad_output, float epsilon,
-                    int64 feature_index);
-
-// Implementation details below this point.
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR0(NativeT value) {
-  return ConstantLiteral(*LiteralUtil::CreateR0<NativeT>(value));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR1(tensorflow::gtl::ArraySlice<NativeT> values) {
-  return ConstantLiteral(*LiteralUtil::CreateR1<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR1(int64 length, NativeT value) {
-  Literal literal(ShapeUtil::MakeShape(
-      primitive_util::NativeToPrimitiveType<NativeT>(), {length}));
-  literal.PopulateWithValue(value);
-  return ConstantLiteral(literal);
-}
-
-inline XlaOp XlaBuilder::ConstantR1(const tensorflow::core::Bitmap& values) {
-  return ConstantLiteral(*LiteralUtil::CreateR1(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR2(
-    std::initializer_list<std::initializer_list<NativeT>> values) {
-  return ConstantLiteral(*LiteralUtil::CreateR2<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantFromArrayWithLayout(const Array<NativeT>& values,
-                                              const Layout& layout) {
-  return ConstantLiteral(
-      *LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantFromArray(const Array<NativeT>& values) {
-  return ConstantLiteral(*LiteralUtil::CreateFromArray<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR2FromArray2DWithLayout(
-    const Array2D<NativeT>& values, const Layout& layout) {
-  return ConstantLiteral(
-      *LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR2FromArray2D(const Array2D<NativeT>& values) {
-  return ConstantLiteral(*LiteralUtil::CreateR2FromArray2D<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR3FromArray3DWithLayout(
-    const Array3D<NativeT>& values, const Layout& layout) {
-  return ConstantLiteral(
-      *LiteralUtil::CreateR3FromArray3DWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR3FromArray3D(const Array3D<NativeT>& values) {
-  return ConstantFromArray(values);
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR4FromArray4DWithLayout(
-    const Array4D<NativeT>& values, const Layout& layout) {
-  return ConstantFromArrayWithLayout(values, layout);
-}
-
-template <typename NativeT>
-XlaOp XlaBuilder::ConstantR4FromArray4D(const Array4D<NativeT>& values) {
-  return ConstantFromArray(values);
-}
-
-// Free function template implementations.
-
-template <typename NativeT>
-XlaOp ConstantR0(XlaBuilder* builder, NativeT value) {
-  return ConstantLiteral(builder, *LiteralUtil::CreateR0<NativeT>(value));
-}
-
-template <typename NativeT>
-XlaOp ConstantR1(XlaBuilder* builder,
-                 tensorflow::gtl::ArraySlice<NativeT> values) {
-  return ConstantLiteral(builder, *LiteralUtil::CreateR1<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp ConstantR1(XlaBuilder* builder, int64 length, NativeT value) {
-  Literal literal(ShapeUtil::MakeShape(
-      primitive_util::NativeToPrimitiveType<NativeT>(), {length}));
-  literal.PopulateWithValue(value);
-  return ConstantLiteral(builder, literal);
-}
-
-inline XlaOp ConstantR1(XlaBuilder* builder,
-                        const tensorflow::core::Bitmap& values) {
-  return ConstantLiteral(builder, *LiteralUtil::CreateR1(values));
-}
-
-template <typename NativeT>
-XlaOp ConstantR2(XlaBuilder* builder,
-                 std::initializer_list<std::initializer_list<NativeT>> values) {
-  return ConstantLiteral(builder, *LiteralUtil::CreateR2<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp ConstantFromArrayWithLayout(XlaBuilder* builder,
-                                  const Array<NativeT>& values,
-                                  const Layout& layout) {
-  return ConstantLiteral(
-      builder,
-      *LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp ConstantFromArray(XlaBuilder* builder, const Array<NativeT>& values) {
-  return ConstantLiteral(builder,
-                         *LiteralUtil::CreateFromArray<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp ConstantR2FromArray2DWithLayout(XlaBuilder* builder,
-                                      const Array2D<NativeT>& values,
-                                      const Layout& layout) {
-  return ConstantLiteral(
-      builder,
-      *LiteralUtil::CreateFromArrayWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp ConstantR2FromArray2D(XlaBuilder* builder,
-                            const Array2D<NativeT>& values) {
-  return ConstantLiteral(builder,
-                         *LiteralUtil::CreateR2FromArray2D<NativeT>(values));
-}
-
-template <typename NativeT>
-XlaOp ConstantR3FromArray3DWithLayout(XlaBuilder* builder,
-                                      const Array3D<NativeT>& values,
-                                      const Layout& layout) {
-  return ConstantLiteral(
-      builder,
-      *LiteralUtil::CreateR3FromArray3DWithLayout<NativeT>(values, layout));
-}
-
-template <typename NativeT>
-XlaOp ConstantR3FromArray3D(XlaBuilder* builder,
-                            const Array3D<NativeT>& values) {
-  return ConstantFromArray(builder, values);
-}
-
-template <typename NativeT>
-XlaOp ConstantR4FromArray4DWithLayout(XlaBuilder* builder,
-                                      const Array4D<NativeT>& values,
-                                      const Layout& layout) {
-  return ConstantFromArrayWithLayout(builder, values, layout);
-}
-
-template <typename NativeT>
-XlaOp ConstantR4FromArray4D(XlaBuilder* builder,
-                            const Array4D<NativeT>& values) {
-  return ConstantFromArray(builder, values);
-}
-
-}  // namespace xla
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc b/tensorflow/compiler/xla/client/xla_computation.cc
similarity index 94%
rename from tensorflow/compiler/xla/client/xla_client/xla_computation.cc
rename to tensorflow/compiler/xla/client/xla_computation.cc
index 72e3935696e0c44ae3893fc8f1ceb261fa5e2646..3543d41fc2656ec028646edebc0bf5b6af7f67a5 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_computation.cc
+++ b/tensorflow/compiler/xla/client/xla_computation.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 
 #include <utility>
 
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.h b/tensorflow/compiler/xla/client/xla_computation.h
similarity index 90%
rename from tensorflow/compiler/xla/client/xla_client/xla_computation.h
rename to tensorflow/compiler/xla/client/xla_computation.h
index 0ffba208b1f8683fe1d26107cbfd096b856267f1..71598ef8b296a760b0ee818fce0a59aed5cfc6b4 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_computation.h
+++ b/tensorflow/compiler/xla/client/xla_computation.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_COMPUTATION_H_
-#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_COMPUTATION_H_
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_COMPUTATION_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_COMPUTATION_H_
 
 #include <utility>
 
@@ -64,4 +64,4 @@ class XlaComputation {
 
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_COMPUTATION_H_
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_XLA_COMPUTATION_H_
diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index 5db124b5a226238931a2038969d1f131743e554a..0545deb096e9eace5a9713f200e10559aa718441 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -1775,7 +1775,9 @@ void LiteralBase::Piece::WriteToProto(LiteralProto* proto) const {
       // Nothing to do but assign the shape which is done above.
       return;
     default:
-      LOG(FATAL) << "Unhandled primitive type " << subshape().element_type();
+      // TODO(b/111551621): Support serializing more PrimitiveTypes.
+      LOG(FATAL) << "Unhandled primitive type "
+                 << PrimitiveType_Name(subshape().element_type());
   }
 }
 
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index fe346f9956adaed8d0127e92517b2cd32be05105..c8f2d65c223ccfe20862954c224d016cca421812 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -53,9 +53,9 @@ cc_library(
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:executable_build_options",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:math",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index 66b1c08a391703afb0474a0e947efcca0ebb00bf..434d78d78dd58f8bfcb992eb4f3d81beaadb56c3 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/python/local_computation_builder.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -486,6 +486,11 @@ LocalOp LocalComputationBuilder::ConvertElementType(
   return xla::ConvertElementType(operand.op(), new_element_type);
 }
 
+LocalOp LocalComputationBuilder::BitcastConvertType(
+    const LocalOp& operand, PrimitiveType new_element_type) {
+  return xla::BitcastConvertType(operand.op(), new_element_type);
+}
+
 LocalOp LocalComputationBuilder::Call(
     const LocalComputation& local_computation,
     tensorflow::gtl::ArraySlice<LocalOp> operands) {
@@ -617,6 +622,8 @@ _FORWARD_BINOP(Xor)
 _FORWARD_BINOP(ShiftLeft)
 _FORWARD_BINOP(ShiftRightArithmetic)
 _FORWARD_BINOP(ShiftRightLogical)
+_FORWARD_BINOP(Atan2)
+_FORWARD_BINOP(Pow)
 _FORWARD_UNOP(Not)
 _FORWARD_UNOP(Abs)
 _FORWARD_UNOP(Exp)
@@ -630,13 +637,27 @@ _FORWARD_UNOP(Sign)
 _FORWARD_UNOP(Cos)
 _FORWARD_UNOP(Sin)
 _FORWARD_UNOP(Tanh)
-_FORWARD_UNOP(Sqrt)
-_FORWARD_UNOP(Square)
-_FORWARD_BINOP(Pow)
 _FORWARD_UNOP(IsFinite)
-_FORWARD_UNOP(Reciprocal)
 _FORWARD_UNOP(Neg)
 _FORWARD_UNOP(Sort)
+_FORWARD_UNOP(Sqrt)
+_FORWARD_UNOP(Rsqrt)
+_FORWARD_UNOP(Square)
+_FORWARD_UNOP(Reciprocal)
+_FORWARD_UNOP(Erfc)
+_FORWARD_UNOP(Erf)
+_FORWARD_UNOP(ErfInv)
+_FORWARD_UNOP(Lgamma)
+_FORWARD_UNOP(Digamma)
+_FORWARD_UNOP(Acos)
+_FORWARD_UNOP(Asin)
+_FORWARD_UNOP(Atan)
+_FORWARD_UNOP(Tan)
+_FORWARD_UNOP(Acosh)
+_FORWARD_UNOP(Asinh)
+_FORWARD_UNOP(Atanh)
+_FORWARD_UNOP(Cosh)
+_FORWARD_UNOP(Sinh)
 
 #undef _FORWARD
 #undef _FORWARD_UNOP
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index 17ad04457809a384b56975f892f9567dfe163787..545aa63f9d6e2e2e26c26f49941a5160279154b3 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -19,8 +19,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -259,6 +259,9 @@ class LocalComputationBuilder {
   LocalOp ConvertElementType(const LocalOp& operand,
                              PrimitiveType new_element_type);
 
+  LocalOp BitcastConvertType(const LocalOp& operand,
+                             PrimitiveType new_element_type);
+
   LocalOp Call(const LocalComputation& local_computation,
                tensorflow::gtl::ArraySlice<LocalOp> operands);
 
@@ -336,6 +339,8 @@ class LocalComputationBuilder {
   _FORWARD_BINOP(ShiftLeft)
   _FORWARD_BINOP(ShiftRightArithmetic)
   _FORWARD_BINOP(ShiftRightLogical)
+  _FORWARD_BINOP(Atan2)
+  _FORWARD_BINOP(Pow)
   _FORWARD_UNOP(Not)
   _FORWARD_UNOP(Abs)
   _FORWARD_UNOP(Exp)
@@ -349,13 +354,27 @@ class LocalComputationBuilder {
   _FORWARD_UNOP(Cos)
   _FORWARD_UNOP(Sin)
   _FORWARD_UNOP(Tanh)
-  _FORWARD_UNOP(Sqrt)
-  _FORWARD_UNOP(Square)
-  _FORWARD_BINOP(Pow)
   _FORWARD_UNOP(IsFinite)
-  _FORWARD_UNOP(Reciprocal)
   _FORWARD_UNOP(Neg)
   _FORWARD_UNOP(Sort)
+  _FORWARD_UNOP(Sqrt)
+  _FORWARD_UNOP(Rsqrt)
+  _FORWARD_UNOP(Square)
+  _FORWARD_UNOP(Reciprocal)
+  _FORWARD_UNOP(Erfc)
+  _FORWARD_UNOP(Erf)
+  _FORWARD_UNOP(ErfInv)
+  _FORWARD_UNOP(Lgamma)
+  _FORWARD_UNOP(Digamma)
+  _FORWARD_UNOP(Acos)
+  _FORWARD_UNOP(Asin)
+  _FORWARD_UNOP(Atan)
+  _FORWARD_UNOP(Tan)
+  _FORWARD_UNOP(Acosh)
+  _FORWARD_UNOP(Asinh)
+  _FORWARD_UNOP(Atanh)
+  _FORWARD_UNOP(Cosh)
+  _FORWARD_UNOP(Sinh)
 
 #undef _FORWARD
 #undef _FORWARD_UNOP
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 42bf76e5d8f7236cb2dbab597a0243696e283aab..9b8b0aa7f28e64f434bb24f88a3a9cbe177f8a78 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -957,6 +957,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::Tuple;
 %unignore xla::swig::LocalComputationBuilder::GetTupleElement;
 %unignore xla::swig::LocalComputationBuilder::ConvertElementType;
+%unignore xla::swig::LocalComputationBuilder::BitcastConvertType;
 %unignore xla::swig::LocalComputationBuilder::Call;
 %unignore xla::swig::LocalComputationBuilder::Transpose;
 %unignore xla::swig::LocalComputationBuilder::Rev;
@@ -1005,13 +1006,29 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::Cos;
 %unignore xla::swig::LocalComputationBuilder::Sin;
 %unignore xla::swig::LocalComputationBuilder::Tanh;
-%unignore xla::swig::LocalComputationBuilder::Sqrt;
-%unignore xla::swig::LocalComputationBuilder::Square;
-%unignore xla::swig::LocalComputationBuilder::Pow;
+%unignore xla::swig::LocalComputationBuilder::Atan2;
 %unignore xla::swig::LocalComputationBuilder::IsFinite;
-%unignore xla::swig::LocalComputationBuilder::Reciprocal;
+%unignore xla::swig::LocalComputationBuilder::Pow;
 %unignore xla::swig::LocalComputationBuilder::Neg;
 %unignore xla::swig::LocalComputationBuilder::Sort;
+%unignore xla::swig::LocalComputationBuilder::Sqrt;
+%unignore xla::swig::LocalComputationBuilder::Rsqrt;
+%unignore xla::swig::LocalComputationBuilder::Square;
+%unignore xla::swig::LocalComputationBuilder::Reciprocal;
+%unignore xla::swig::LocalComputationBuilder::Erfc;
+%unignore xla::swig::LocalComputationBuilder::Erf;
+%unignore xla::swig::LocalComputationBuilder::ErfInv;
+%unignore xla::swig::LocalComputationBuilder::Lgamma;
+%unignore xla::swig::LocalComputationBuilder::Digamma;
+%unignore xla::swig::LocalComputationBuilder::Acos;
+%unignore xla::swig::LocalComputationBuilder::Asin;
+%unignore xla::swig::LocalComputationBuilder::Atan;
+%unignore xla::swig::LocalComputationBuilder::Tan;
+%unignore xla::swig::LocalComputationBuilder::Acosh;
+%unignore xla::swig::LocalComputationBuilder::Asinh;
+%unignore xla::swig::LocalComputationBuilder::Atanh;
+%unignore xla::swig::LocalComputationBuilder::Cosh;
+%unignore xla::swig::LocalComputationBuilder::Sinh;
 %unignore xla::swig::DestructureLocalShapedBufferTuple;
 %unignore xla::swig::DeleteLocalShapedBuffer;
 %unignore xla::swig::DeleteLocalComputation;
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index f93d7bda2d1bb62bc5f322b63ae4b20e2eedcd7a..c0105b385b02e13b360ad1fb5af734d2209a92c2 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -99,12 +99,27 @@ _UNARY_OPS = [
     'Cos',
     'Sin',
     'Tanh',
+    'IsFinite',
     'Sqrt',
+    'Rsqrt',
     'Square',
-    'IsFinite',
     'Reciprocal',
     'Neg',
     'Sort',
+    'Erf',
+    'Erfc',
+    'ErfInv',
+    'Lgamma',
+    'Digamma',
+    'Acos',
+    'Asin',
+    'Atan',
+    'Tan',
+    'Acosh',
+    'Asinh',
+    'Atanh',
+    'Cosh',
+    'Sinh',
 ]
 
 _BINARY_OPS = [
@@ -128,6 +143,7 @@ _BINARY_OPS = [
     'ShiftLeft',
     'ShiftRightArithmetic',
     'ShiftRightLogical',
+    'Atan2',
 ]
 
 
@@ -705,6 +721,18 @@ class ComputationBuilder(object):
     """
     return self._client.ConvertElementType(operand, new_element_type)
 
+  def BitcastConvertType(self, operand, new_element_type):
+    """Enqueues a bitcast type conversion operation onto the computation.
+
+    Args:
+      operand: the operand to convert.
+      new_element_type: the target primitive type.
+
+    Returns:
+      A LocalOp representing the added conversion op.
+    """
+    return self._client.BitcastConvertType(operand, new_element_type)
+
   def GetShape(self, operand):
     return _wrap_shape(self._client.GetShape(operand))
 
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 93177aa6479134ba427f8174e9c85dbe5eda225b..fd98e19457f61aade947aa354d2e415148d127f6 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -489,6 +489,34 @@ class SingleOpTest(LocalComputationTest):
     for src_dtype, dst_dtype in itertools.product(xla_types, xla_types):
       _ConvertAndTest(x, src_dtype, dst_dtype)
 
+  def testBitcastConvertType(self):
+    xla_x32_types = {
+        np.int32: xla_client.xla_data_pb2.S32,
+        np.float32: xla_client.xla_data_pb2.F32,
+    }
+
+    xla_x64_types = {
+        np.int64: xla_client.xla_data_pb2.S64,
+        np.float64: xla_client.xla_data_pb2.F64,
+    }
+
+    def _ConvertAndTest(template, src_dtype, dst_dtype, dst_etype):
+      c = self._NewComputation()
+      x = c.Constant(np.array(template, dtype=src_dtype))
+      c.BitcastConvertType(x, dst_etype)
+
+      result = c.Build().Compile().Execute()
+      expected = np.array(template, src_dtype).view(dst_dtype)
+
+      self.assertEqual(result.shape, expected.shape)
+      self.assertEqual(result.dtype, expected.dtype)
+      np.testing.assert_equal(result, expected)
+
+    x = [0, 1, 0, 0, 1]
+    for xla_types in [xla_x32_types, xla_x64_types]:
+      for src_dtype, dst_dtype in itertools.product(xla_types, xla_types):
+        _ConvertAndTest(x, src_dtype, dst_dtype, xla_types[dst_dtype])
+
   def testCrossReplicaSumOneReplica(self):
     samples = [
         NumpyArrayF32(42.0),
diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc
index 6397f1f47915aaa559beda467c26c66795c98f60..a803520876952a0ab67ecb827b1f256c915335f9 100644
--- a/tensorflow/compiler/xla/reference_util.cc
+++ b/tensorflow/compiler/xla/reference_util.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <array>
 #include <utility>
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h"
 #include "tensorflow/compiler/xla/service/hlo_evaluator.h"
diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD
index 0b1cec1925d4424db086f8a3f62c91ede090189c..44b22a5586dee3f7dd8ea0edbf9deb2090986ac8 100644
--- a/tensorflow/compiler/xla/rpc/BUILD
+++ b/tensorflow/compiler/xla/rpc/BUILD
@@ -56,7 +56,7 @@ tf_cc_test(
         ":grpc_stub",
         "//tensorflow:grpc++",
         "//tensorflow/compiler/xla/client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/xla/rpc/grpc_client_test.cc b/tensorflow/compiler/xla/rpc/grpc_client_test.cc
index 90efee50b4f19056fac8ef1b341b48175903ff83..67886761813f0bb45a600661b017be91ffeade73 100644
--- a/tensorflow/compiler/xla/rpc/grpc_client_test.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_client_test.cc
@@ -24,7 +24,7 @@ limitations under the License.
 #include "grpcpp/security/credentials.h"
 
 #include "tensorflow/compiler/xla/client/client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/rpc/grpc_stub.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index cba7883fde855ec6b1cd08184eab4a19afa5722d..528b7fdfd3c39cc3a56afc92474dbae976a08ba8 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -256,7 +256,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/service:hlo_element_type_converter",
         "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -564,7 +564,7 @@ cc_library(
         ":computation_placer",
         ":device_memory_allocator",
         ":platform_util",
-        ":pool",
+        ":stream_pool",
         ":transfer_manager",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
@@ -598,6 +598,7 @@ cc_library(
         ":hlo_proto_util",
         ":platform_util",
         ":source_map_util",
+        ":stream_pool",
         ":transfer_manager",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:execution_options_util",
@@ -642,7 +643,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:executable_build_options",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
     ],
@@ -751,8 +752,8 @@ cc_library(
         ":hlo_execution_profile",
         ":hlo_graph_dumper",
         ":hlo_proto",
-        ":pool",
         ":shaped_buffer",
+        ":stream_pool",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:status_macros",
@@ -838,7 +839,7 @@ cc_library(
     hdrs = ["execution_tracker.h"],
     deps = [
         ":backend",
-        ":pool",
+        ":stream_pool",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
@@ -946,7 +947,6 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
     ],
@@ -1663,8 +1663,8 @@ tf_cc_test(
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -2670,7 +2670,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -2707,7 +2707,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -2715,21 +2715,25 @@ tf_cc_test(
 )
 
 cc_library(
-    name = "pool",
-    hdrs = ["pool.h"],
+    name = "stream_pool",
+    srcs = ["stream_pool.cc"],
+    hdrs = ["stream_pool.h"],
     deps = [
+        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
     ],
 )
 
 tf_cc_test(
-    name = "pool_test",
-    srcs = ["pool_test.cc"],
+    name = "stream_pool_test",
+    srcs = ["stream_pool_test.cc"],
     deps = [
-        ":pool",
+        ":stream_pool",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:stream_executor_no_cuda",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 2205a7ec18a2091f574a33eca6aff9e16f098871..505c0e8dff44ace09bd67f54ecb3f2716a2fb167 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -1744,19 +1744,37 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) {
   if (ReplaceInstructionIfSameShape(slice, slice->mutable_operand(0))) {
     return Status::OK();
   }
+
+  auto is_unstrided_slice = [](const HloInstruction* hlo) {
+    return c_all_of(hlo->slice_strides(),
+                    [](int64 stride) { return stride == 1; });
+  };
+  if (slice->operand(0)->opcode() == HloOpcode::kSlice &&
+      is_unstrided_slice(slice) && is_unstrided_slice(slice->operand(0))) {
+    HloInstruction* operand_slice = slice->mutable_operand(0);
+    std::vector<int64> new_slice_starts = slice->slice_starts();
+    std::vector<int64> new_slice_limits = slice->slice_limits();
+    for (int64 i = 0; i < new_slice_starts.size(); ++i) {
+      new_slice_starts[i] += operand_slice->slice_starts(i);
+      new_slice_limits[i] += operand_slice->slice_starts(i);
+    }
+    return ReplaceWithNewInstruction(
+        slice, HloInstruction::CreateSlice(
+                   slice->shape(), operand_slice->mutable_operand(0),
+                   new_slice_starts, new_slice_limits, slice->slice_strides()));
+  }
   return Status::OK();
 }
 
 Status AlgebraicSimplifierVisitor::HandleDynamicSlice(
     HloInstruction* dynamic_slice) {
   auto operand = dynamic_slice->mutable_operand(0);
-  auto start_indices = dynamic_slice->operand(1);
   if (ShapeUtil::IsScalar(dynamic_slice->shape())) {
     return ReplaceInstruction(dynamic_slice, operand);
   }
-  // DynamicSlice where operand has the same size as the output and
-  // start_indices are all zero is simply equal to operand.
-  if (IsAll(start_indices, 0) && SameShape(operand, dynamic_slice)) {
+  // DynamicSlice where operand has the same size as the output is simply equal
+  // to operand.
+  if (SameShape(operand, dynamic_slice)) {
     return ReplaceInstruction(dynamic_slice, operand);
   }
   return Status::OK();
@@ -1765,20 +1783,10 @@ Status AlgebraicSimplifierVisitor::HandleDynamicSlice(
 Status AlgebraicSimplifierVisitor::HandleDynamicUpdateSlice(
     HloInstruction* dynamic_update_slice) {
   auto update = dynamic_update_slice->mutable_operand(1);
-  auto start_indices = dynamic_update_slice->operand(2);
-  // DynamicUpdateSlice on a scalar just passes through the update argument.
-  if (ShapeUtil::IsScalar(dynamic_update_slice->shape())) {
-    return ReplaceInstruction(dynamic_update_slice, update);
-  }
 
-  // DynamicUpdateSlice where operand and update have the same size and
-  // start_indices are all zero is simply equal to update.
-  //
-  // (We require start_indices to be all zero because we want this optimization
-  // not to affect the visible behavior of this op even when the indices are out
-  // of range.  Currently dynamic-update-slice wraps out-of-range indices, so
-  // we can only remove the op if its indices never wrap.)
-  if (IsAll(start_indices, 0) && SameShape(dynamic_update_slice, update)) {
+  // DynamicUpdateSlice where operand and update have the same size is simply
+  // equal to update.
+  if (SameShape(dynamic_update_slice, update)) {
     return ReplaceInstruction(dynamic_update_slice, update);
   }
 
@@ -1904,6 +1912,26 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
                       new_reduce_dimensions, function));
     }
   }
+  // Convert Reduce(concat({a,b,...})) to
+  //  map(reduce(a),map(reduce(b),...,))
+  //
+  // This should make fusion easier or use less memory bandwidth in the unfused
+  // case.
+  if (arg->opcode() == HloOpcode::kConcatenate &&
+      c_linear_search(reduce->dimensions(), arg->concatenate_dimension())) {
+    HloInstruction* old_reduce = nullptr;
+    for (HloInstruction* operand : arg->operands()) {
+      HloInstruction* new_reduce = computation_->AddInstruction(
+          HloInstruction::CreateReduce(reduce->shape(), operand, init_value,
+                                       reduce->dimensions(), function));
+      if (old_reduce != nullptr) {
+        new_reduce = computation_->AddInstruction(HloInstruction::CreateMap(
+            reduce->shape(), {old_reduce, new_reduce}, function));
+      }
+      old_reduce = new_reduce;
+    }
+    return ReplaceInstruction(reduce, old_reduce);
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 3f0f2afadd4238f631db29084aa6bf279d5b2a70..8b81b4c97ef373bcfb89bf0761ebb16b6e14e3fc 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -1250,6 +1250,55 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
               op::Concatenate(param0, param0, param1));
 }
 
+// Test that reduce of concat is simplified.
+TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) {
+  const int kParamLength = 100;
+  Shape r3f32 =
+      ShapeUtil::MakeShape(F32, {kParamLength, kParamLength, kParamLength});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r3f32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r3f32, "param1"));
+  HloInstruction* param2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(2, r3f32, "param2"));
+  Shape concat_shape =
+      ShapeUtil::MakeShape(F32, {kParamLength, 3 * kParamLength, kParamLength});
+  HloInstruction* Concatenate =
+      builder.AddInstruction(HloInstruction::CreateConcatenate(
+          concat_shape, {param0, param1, param2}, 1));
+  HloComputation* add_computation = nullptr;
+  {
+    HloComputation::Builder builder(TestName() + ".add");
+    const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+    HloInstruction* p0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, scalar_shape, "p0"));
+    HloInstruction* p1 = builder.AddInstruction(
+        HloInstruction::CreateParameter(1, scalar_shape, "p1"));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
+    add_computation = module().AddEmbeddedComputation(builder.Build());
+  }
+  Shape r4f32 = ShapeUtil::MakeShape(F32, {4, 5, 6, 7});
+  Shape reduce_shape = ShapeUtil::MakeShape(F32, {kParamLength});
+
+  HloInstruction* zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0)));
+  builder.AddInstruction(HloInstruction::CreateReduce(
+      reduce_shape, Concatenate, zero, {1, 2}, add_computation));
+
+  auto computation = module().AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+
+  EXPECT_THAT(
+      computation->root_instruction(),
+      op::Map(op::Map(op::Reduce(param0, zero), op::Reduce(param1, zero)),
+              op::Reduce(param2, zero)));
+}
+
 // Test a concatenate with only empty operands is removed.
 TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
   const int kParamLength = 100;
@@ -1859,6 +1908,39 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
   EXPECT_THAT(computation->root_instruction(), param);
 }
 
+TEST_F(AlgebraicSimplifierTest, SliceOfSliceToSlice) {
+  HloComputation::Builder builder(TestName());
+  const int64 dim0 = 11;
+  const int64 dim1 = 12;
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {dim0, dim1}), "param"));
+  HloInstruction* original_slice =
+      builder.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {dim0 - 2, dim1 - 4}), param,
+          /*start_indices=*/{1, 2},
+          /*limit_indices=*/{dim0 - 1, dim1 - 2}, /*strides=*/{1, 1}));
+
+  builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {dim0 - 5, dim1 - 9}), original_slice,
+      /*start_indices=*/{2, 3},
+      /*limit_indices=*/{dim0 - 3, dim1 - 6}, /*strides=*/{1, 1}));
+  auto module = CreateNewModule();
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_THAT(computation->root_instruction(), op::Slice(op::Slice(param)));
+
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(), op::Slice(param));
+  EXPECT_EQ(computation->root_instruction()->slice_starts(0), 3);
+  EXPECT_EQ(computation->root_instruction()->slice_starts(1), 5);
+  EXPECT_EQ(computation->root_instruction()->slice_limits(0), dim0 - 2);
+  EXPECT_EQ(computation->root_instruction()->slice_limits(1), dim1 - 4);
+}
+
 TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
   struct ConvTestOptions {
     int in_batch = 10;
@@ -2407,8 +2489,8 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) {
       shape,
       builder.AddInstruction(
           HloInstruction::CreateParameter(0, shape, "slice_from")),
-      builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR1<int>({0, 0, 0}))),
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          1, ShapeUtil::MakeShape(U32, {3}), "slice_indices")),
       /*slice_sizes=*/{10, 100, 1000}));
 
   auto computation = module().AddEntryComputation(builder.Build());
@@ -2441,8 +2523,8 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
       builder.AddInstruction(
           HloInstruction::CreateParameter(2, slice_shape, "to_update")),
       slice,
-      builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR1<int>({0, 0, 0})))));
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          3, ShapeUtil::MakeShape(U32, {3}), "update_indices"))));
 
   auto computation = module().AddEntryComputation(builder.Build());
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc
index 349b32451a697dbd6804b44cd1a36419c753bb14..d12be3e007fe0b16ac850d64521f0025d481b5d2 100644
--- a/tensorflow/compiler/xla/service/backend.cc
+++ b/tensorflow/compiler/xla/service/backend.cc
@@ -96,24 +96,19 @@ Backend::CreateDefaultBackend() {
   return CreateBackend(backend_options);
 }
 
-StatusOr<Backend::StreamPtr> Backend::BorrowStream(int device_ordinal) {
-  TF_ASSIGN_OR_RETURN(auto exec, stream_executor(device_ordinal));
-  return BorrowStream(exec);
+StatusOr<StreamPool::Ptr> Backend::BorrowStream(int device_ordinal) {
+  TF_ASSIGN_OR_RETURN(auto executor, stream_executor(device_ordinal));
+  return BorrowStream(executor);
 }
 
-StatusOr<Backend::StreamPtr> Backend::BorrowStream(
-    se::StreamExecutor* executor) {
+StatusOr<StreamPool::Ptr> Backend::BorrowStream(se::StreamExecutor* executor) {
   tensorflow::mutex_lock l(mu_);
   if (0 == stream_pools_.count(executor)) {
     stream_pools_.emplace(std::piecewise_construct,
                           std::forward_as_tuple(executor),
-                          std::forward_as_tuple([executor]() {
-                            auto stream = MakeUnique<se::Stream>(executor);
-                            stream->Init();
-                            return stream;
-                          }));
+                          std::forward_as_tuple());
   }
-  return stream_pools_.at(executor).Allocate();
+  return stream_pools_.at(executor).BorrowStream(executor);
 }
 
 Backend::Backend(
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index 6546602473e3381cf13879ddebd05d34d1f7a055..1bc3796fa48c1627538474d04ef5358ba64dfce9 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -24,7 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/pool.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -63,11 +63,9 @@ class BackendOptions {
 //
 // It also offers a pooling API for creation/use of initialized streams:
 //
-//    StreamPtr stream = backend->BorrowStream().ConsumeValueOrDie();
+//    StreamPool::Ptr stream = backend->BorrowStream().ConsumeValueOrDie();
 class Backend {
  public:
-  using StreamPtr = Pool<se::Stream>::SmartPtr;
-
   // Creates a new backend.
   static StatusOr<std::unique_ptr<Backend>> CreateBackend(
       const BackendOptions& options);
@@ -114,13 +112,13 @@ class Backend {
   // Borrows a stream for use by the caller, either by grabbing it from an
   // internal pool, or by constructing/initializating it, and returns the result
   // to the caller.
-  StatusOr<StreamPtr> BorrowStream(int device_ordinal);
-  StatusOr<StreamPtr> BorrowStream(se::StreamExecutor* executor);
+  StatusOr<StreamPool::Ptr> BorrowStream(int device_ordinal);
+  StatusOr<StreamPool::Ptr> BorrowStream(se::StreamExecutor* executor);
 
   // Returns a function to borrow a stream, as `BorrowStream` above does.
   // Purely for convenience, the caller could rather make this anonymous
   // function itself.
-  std::function<StatusOr<StreamPtr>(int)> StreamBorrower() {
+  std::function<StatusOr<StreamPool::Ptr>(int)> StreamBorrower() {
     return [this](int device_ordinal) { return BorrowStream(device_ordinal); };
   }
 
@@ -169,7 +167,7 @@ class Backend {
   tensorflow::mutex mu_;
 
   // Mapping from stream executor to stream pools, used by `BorrowStream` above.
-  std::map<se::StreamExecutor*, Pool<se::Stream>> stream_pools_ GUARDED_BY(mu_);
+  std::map<se::StreamExecutor*, StreamPool> stream_pools_ GUARDED_BY(mu_);
 
   // The default memory allocator to use.
   std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
index b21c83a07f69d6ec93cf9305802e4d3af2783bdc..2fb401c4289728f3f59538464c5b8ad49957985b 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc
@@ -215,7 +215,12 @@ bool BFloat16Propagation::AllUsersConsumeBF16(const HloInstruction& hlo,
     if (ContainsKey(values_that_must_be_kept_as_f32_, value)) {
       return false;
     }
-    if (ValueTypeAfterChange(value) == BF16) {
+    // We use the original type for the value because we are going to examine
+    // the uses of it, instead of the value itself. If ValueTypeAfterChange()
+    // were used, it would cause problems when there are aliasing buffers, i.e.,
+    // ResolveInconsistencyOfAliasingBuffers() would fail to revert the
+    // tentative change to BF16 even if the uses require F32.
+    if (value->shape().element_type() == BF16) {
       continue;
     }
     for (const HloUse& use : value->uses()) {
@@ -566,6 +571,9 @@ bool BFloat16Propagation::ResolveInconsistencyOfAliasingBuffersHelper(
       }
       visited_computations->insert(visited_in_while.begin(),
                                    visited_in_while.end());
+    } else if (hlo->opcode() == HloOpcode::kFusion) {
+      ResolveInconsistencyOfAliasingBuffersHelper(
+          hlo->fused_instructions_computation(), visited_computations);
     }
   }
   // Now adjust parameters of called computations.
@@ -769,8 +777,7 @@ StatusOr<bool> BFloat16Propagation::Run(HloModule* module) {
   // propagation in reverse topological order.
   for (auto comp_it = computations_topological_order.rbegin();
        comp_it != computations_topological_order.rend(); ++comp_it) {
-    if ((*comp_it)->IsFusionComputation()) {
-      // Fusion computations are handled when visiting the fusion instruction.
+    if (ContainsKey(computations_visited_in_backward_pass_, *comp_it)) {
       continue;
     }
     auto insts = (*comp_it)->MakeInstructionPostOrder();
@@ -778,6 +785,7 @@ StatusOr<bool> BFloat16Propagation::Run(HloModule* module) {
       DetermineInstructionPrecision(*inst_it,
                                     /*skip_parameters=*/true);
     }
+    computations_visited_in_backward_pass_.insert(*comp_it);
   }
 
   // It's possible that an instruction does not define a buffer, but the
diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
index aeafb25ad7215ea3d297e4a8bf7e1ba72d33d528..69b654d30e42b1ed69304206f09120e86831d468 100644
--- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc
@@ -508,6 +508,63 @@ TEST_F(BFloat16PropagationTest, PropagateThroughSimpleWhile) {
   EXPECT_FALSE(OutputsBF16(dot));
 }
 
+// Tests that if the while condition prevents using BF16, no changes should be
+// made to the while body and thus the fusion node inside it.
+TEST_F(BFloat16PropagationTest,
+       ConditionPreventsPropagationForFusionInsideWhile) {
+  auto module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {4, 4});
+
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, shape, "param1"));
+  HloInstruction* add = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param0, param1));
+
+  auto builder_cond = HloComputation::Builder("cond");
+  auto cond_param = builder_cond.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "cond_param"));
+  builder_cond.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(PRED, {}), HloOpcode::kGt,
+      builder_cond.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {}), cond_param, {0, 0}, {1, 1}, {1, 1})),
+      builder_cond.AddInstruction(HloInstruction::CreateSlice(
+          ShapeUtil::MakeShape(F32, {}), cond_param, {1, 1}, {2, 2}, {1, 1}))));
+  auto cond = module->AddEmbeddedComputation(builder_cond.Build());
+
+  auto builder_body = HloComputation::Builder("body");
+  auto body_param = builder_body.AddInstruction(
+      HloInstruction::CreateParameter(0, shape, "body_param"));
+  auto body_transpose = builder_body.AddInstruction(
+      HloInstruction::CreateTranspose(shape, body_param, {0, 1}));
+
+  auto builder_f = HloComputation::Builder("fusion");
+  HloInstruction* a_f =
+      builder_f.AddInstruction(HloInstruction::CreateParameter(0, shape, "a"));
+  builder_f.AddInstruction(HloInstruction::CreateTranspose(shape, a_f, {0, 1}));
+  auto comp_f = module->AddEmbeddedComputation(builder_f.Build());
+  auto body_fusion = builder_body.AddInstruction(HloInstruction::CreateFusion(
+      shape, HloInstruction::FusionKind::kCustom, {body_transpose}, comp_f));
+  auto body = module->AddEmbeddedComputation(builder_body.Build());
+
+  auto while_hlo = builder.AddInstruction(
+      HloInstruction::CreateWhile(shape, cond, body, add));
+
+  auto dot = builder.AddInstruction(HloInstruction::CreateBinary(
+      shape, HloOpcode::kDot, while_hlo, while_hlo));
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_FALSE(PropagatePrecision(module.get()));
+  EXPECT_EQ(computation->root_instruction(), dot);
+  EXPECT_FALSE(OutputsBF16(add));
+  EXPECT_FALSE(OutputsBF16(body_fusion));
+  EXPECT_FALSE(OutputsBF16(body_param));
+  EXPECT_FALSE(OutputsBF16(body_transpose));
+  EXPECT_FALSE(OutputsBF16(a_f));
+}
+
 // Tests that BF16 is propagated properly through while computations with
 // tuple-shaped input/output.
 TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
@@ -553,10 +610,14 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
       HloInstruction::CreateGetTupleElement(shape, body_param, 0));
   auto body_rhs = builder_body.AddInstruction(
       HloInstruction::CreateGetTupleElement(shape, body_param, 1));
-  auto body_dot = builder_body.AddInstruction(
+  auto body_dot1 = builder_body.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kDot, body_lhs, body_rhs));
+  auto body_dot2 = builder_body.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kDot, body_rhs, body_lhs));
+  auto body_transpose = builder_body.AddInstruction(
+      HloInstruction::CreateTranspose(shape, body_dot2, {0, 1}));
   builder_body.AddInstruction(
-      HloInstruction::CreateTuple({body_dot, body_rhs}));
+      HloInstruction::CreateTuple({body_dot1, body_transpose}));
   auto body = module->AddEmbeddedComputation(builder_body.Build());
 
   auto while_hlo = builder.AddInstruction(
@@ -575,9 +636,11 @@ TEST_F(BFloat16PropagationTest, PropagateThroughTupleWhile) {
   EXPECT_EQ(computation->root_instruction(), dot);
   EXPECT_TRUE(OutputsBF16(lhs));
   EXPECT_FALSE(OutputsBF16(rhs));
-  EXPECT_TRUE(OutputsBF16(body_dot));
+  EXPECT_TRUE(OutputsBF16(body_dot1));
   EXPECT_TRUE(OutputsBF16(body_lhs));
   EXPECT_FALSE(OutputsBF16(body_rhs));
+  EXPECT_FALSE(OutputsBF16(body_dot2));
+  EXPECT_FALSE(OutputsBF16(body_transpose));
   EXPECT_TRUE(OutputsBF16(cond_lhs));
   EXPECT_FALSE(OutputsBF16(cond_rhs));
   EXPECT_TRUE(OutputsBF16(add0));
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 783e3f7e7377e90e27e5462b1c482fd57d4ae38c..e4d2e73b994819f748bceb6a9b2f9c1ca7c16308 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -270,7 +270,7 @@ BufferAllocationProto BufferAllocation::ToProto() const {
   proto.set_index(index_);
   proto.set_size(size_);
   proto.set_is_thread_local(is_thread_local_);
-  proto.set_is_reusable(is_reusable_);
+  proto.set_is_tuple(is_tuple_);
   proto.set_color(color_.value());
   if (is_entry_computation_parameter_) {
     proto.set_is_entry_computation_parameter(true);
@@ -279,6 +279,7 @@ BufferAllocationProto BufferAllocation::ToProto() const {
     }
     proto.set_parameter_number(parameter_number_);
   }
+  proto.set_is_constant(is_constant_);
   proto.set_maybe_live_out(maybe_live_out_);
   for (const auto& buffer_offset_size : assigned_buffers_) {
     BufferAllocationProto::Assigned* proto_assigned = proto.add_assigned();
@@ -304,6 +305,9 @@ string BufferAllocation::ToString() const {
     StrAppend(&output, ", parameter ", parameter_number(), " at ShapeIndex ",
               param_shape_index().ToString());
   }
+  if (is_constant()) {
+    StrAppend(&output, ", constant");
+  }
   if (is_thread_local()) {
     StrAppend(&output, ", thread-local");
   }
@@ -491,20 +495,16 @@ BufferAssignment::GetUniqueTopLevelOutputSlice() const {
 }
 
 BufferAllocation* BufferAssignment::NewEmptyAllocation(
-    int64 size, bool is_thread_local, bool is_reusable,
-    LogicalBuffer::Color color) {
+    int64 size, LogicalBuffer::Color color) {
   BufferAllocation::Index index = allocations_.size();
-  allocations_.emplace_back(index, size, is_thread_local, is_reusable, color);
+  allocations_.emplace_back(index, size, color);
   BufferAllocation* allocation = &allocations_.back();
   return allocation;
 }
 
 BufferAllocation* BufferAssignment::NewAllocation(const LogicalBuffer& buffer,
-                                                  int64 size,
-                                                  bool is_thread_local,
-                                                  bool is_reusable) {
-  BufferAllocation* allocation =
-      NewEmptyAllocation(size, is_thread_local, is_reusable, buffer.color());
+                                                  int64 size) {
+  BufferAllocation* allocation = NewEmptyAllocation(size, buffer.color());
   AddAssignment(allocation, buffer, /*offset=*/0, size);
   allocation->peak_buffers_.push_back(&buffer);
   return allocation;
@@ -517,7 +517,8 @@ void BufferAssignment::AddAssignment(BufferAllocation* allocation,
   CHECK_EQ(0, allocation_index_for_buffer_.count(&buffer))
       << "LogicalBuffer " << buffer << " already has an allocation.";
   CHECK(allocation->is_reusable() || allocation->assigned_buffers().empty())
-      << "Non-reusable allocation already assigned a buffer";
+      << "Non-reusable allocation already assigned a buffer: "
+      << allocation->ToString();
 
   TF_CHECK_OK(points_to_analysis().VerifyBuffer(buffer));
 
@@ -609,6 +610,10 @@ Status BufferAssignment::ComputeSummaryStats() {
       stats_.parameter_allocation_count++;
       stats_.parameter_allocation_bytes += allocation.size();
     }
+    if (allocation.is_constant()) {
+      stats_.constant_allocation_count++;
+      stats_.constant_allocation_bytes += allocation.size();
+    }
     if (allocation.maybe_live_out()) {
       stats_.maybe_live_out_allocation_count++;
       stats_.maybe_live_out_allocation_bytes += allocation.size();
@@ -645,6 +650,8 @@ string BufferAssignment::Stats::ToString() const {
   Appendf(&s, "BufferAssignment stats:\n");
   Appendf(&s, "             parameter allocation: %10s\n",
           HumanReadableNumBytes(parameter_allocation_bytes).c_str());
+  Appendf(&s, "              constant allocation: %10s\n",
+          HumanReadableNumBytes(constant_allocation_bytes).c_str());
   Appendf(&s, "        maybe_live_out allocation: %10s\n",
           HumanReadableNumBytes(maybe_live_out_allocation_bytes).c_str());
   Appendf(&s, "     preallocated temp allocation: %10s\n",
@@ -722,8 +729,10 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::Run(
     const HloModule* module, std::unique_ptr<HloOrdering> hlo_ordering,
     LogicalBuffer::SizeFunction buffer_size,
     LogicalBuffer::AlignmentFunction color_alignment,
-    bool allow_input_output_aliasing, BufferLiveness::Colorer colorer) {
-  BufferAssigner assigner(allow_input_output_aliasing, std::move(colorer));
+    bool allow_input_output_aliasing, bool allocate_buffers_for_constants,
+    BufferLiveness::Colorer colorer) {
+  BufferAssigner assigner(allow_input_output_aliasing,
+                          allocate_buffers_for_constants, std::move(colorer));
   return assigner.CreateAssignment(module, std::move(hlo_ordering),
                                    std::move(buffer_size),
                                    std::move(color_alignment));
@@ -751,8 +760,8 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
     return false;
   }
 
-  if (allocation->is_entry_computation_parameter()) {
-    VLOG(4) << "Can't assign: allocation holds parameter";
+  if (allocation->is_readonly()) {
+    VLOG(4) << "Can't assign: allocation is readonly";
     return false;
   }
 
@@ -808,8 +817,7 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation,
 }
 
 Status BufferAssigner::AssignBuffersForComputation(
-    const HloComputation* computation, const DebugOptions& debug_options,
-    bool is_thread_local,
+    const HloComputation* computation, bool is_thread_local,
     const FlatSet<const LogicalBuffer*>& colocated_buffers,
     const FlatSet<BufferAllocation::Index>& colocated_allocations,
     FlatMap<const HloComputation*, FlatSet<const LogicalBuffer*>>*
@@ -905,15 +913,19 @@ Status BufferAssigner::AssignBuffersForComputation(
     TF_RET_CHECK(!assignment->HasAllocation(*buffer));
 
     const HloInstruction* instruction = buffer->instruction();
+    const int64 buffer_size = assignment->buffer_size_(*buffer);
+
     if (instruction->opcode() == HloOpcode::kConstant) {
-      // No BufferAllocations for constants.
-      // TODO(b/32248867): For consistency, constants should get allocations.
-      VLOG(3) << "Skipping constant: " << *buffer;
+      if (allocate_buffers_for_constants_) {
+        BufferAllocation* allocation =
+            assignment->NewAllocation(*buffer, buffer_size);
+        allocation->set_constant(true);
+        VLOG(3) << "New allocation #" << allocation->index() << " for constant "
+                << *buffer;
+      }
       continue;
     }
 
-    const int64 buffer_size = assignment->buffer_size_(*buffer);
-
     const bool is_entry_parameter =
         instruction->opcode() == HloOpcode::kParameter &&
         computation == computation->parent()->entry_computation();
@@ -923,9 +935,7 @@ Status BufferAssigner::AssignBuffersForComputation(
       // computations do not need special allocations because they live inside
       // callers.
       BufferAllocation* allocation =
-          assignment->NewAllocation(*buffer, buffer_size,
-                                    /*is_thread_local=*/false,
-                                    /*is_reusable=*/false);
+          assignment->NewAllocation(*buffer, buffer_size);
       allocation->set_entry_computation_parameter(
           instruction->parameter_number(), buffer->index());
       VLOG(3) << "New allocation #" << allocation->index()
@@ -934,20 +944,18 @@ Status BufferAssigner::AssignBuffersForComputation(
     }
 
     if (is_thread_local) {
-      // We do not reuse thread-local buffers for now, because they are
-      // dynamically allocated and their lifetimes are hard to compute.
-      BufferAllocation* allocation = assignment->NewAllocation(
-          *buffer, buffer_size, is_thread_local, /*is_reusable=*/false);
+      BufferAllocation* allocation =
+          assignment->NewAllocation(*buffer, buffer_size);
+      allocation->set_is_thread_local(true);
       VLOG(3) << "New allocation #" << allocation->index()
               << " for thread-local: " << *buffer;
       continue;
     }
 
     if (ShapeUtil::IsTuple(buffer->shape())) {
-      // TODO(b/34669761): Don't reuse tuple buffers because the GPU backend
-      // assumes longer buffer liveness than indicated by the analysis.
-      BufferAllocation* allocation = assignment->NewAllocation(
-          *buffer, buffer_size, is_thread_local, /*is_reusable=*/false);
+      BufferAllocation* allocation =
+          assignment->NewAllocation(*buffer, buffer_size);
+      allocation->set_is_tuple(true);
       VLOG(3) << "New allocation #" << allocation->index()
               << " for tuple-shaped buffer: " << *buffer;
       continue;
@@ -1030,8 +1038,8 @@ Status BufferAssigner::AssignBuffersForComputation(
     }
 
     if (!assignment->HasAllocation(*buffer)) {
-      BufferAllocation* allocation = assignment->NewAllocation(
-          *buffer, buffer_size, is_thread_local, /*is_reusable=*/true);
+      BufferAllocation* allocation =
+          assignment->NewAllocation(*buffer, buffer_size);
       allocation_indices.push_back(allocation->index());
       VLOG(3) << "New allocation #" << allocation->index()
               << " for: " << *buffer;
@@ -1085,6 +1093,7 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering(
       VLOG(2) << "Simulating heap for color " << color;
       int64 alignment = assignment->color_alignment_(color);
       HeapSimulator::Options options;
+      options.alloc_constants = allocate_buffers_for_constants_;
       BufferValueFlatSet buffer_value_set =
           ToBufferValueFlatSet(single_colored_set.second);
       options.buffers_to_assign = &buffer_value_set;
@@ -1227,8 +1236,8 @@ void BufferAssigner::AssignBuffersFromHeapSimulator(
         result.fragmentation_size;
   }
 
-  BufferAllocation* allocation = assignment->NewEmptyAllocation(
-      result.heap_size, /*is_thread_local=*/false, /*is_reusable=*/true, color);
+  BufferAllocation* allocation =
+      assignment->NewEmptyAllocation(result.heap_size, color);
   for (const auto& buffer_chunk : result.chunk_map) {
     // TODO(lauj) Remove this down_cast after downstream users of
     // BufferAllocation::assigned_buffers() are updated to use BufferValue.
@@ -1332,11 +1341,25 @@ BufferAssigner::MergeColocatedBufferSets(
   auto cannot_merge_buffer_sets = [&colocated_buffer_sets, &buffer_liveness,
                                    &buffer_size,
                                    &is_entry_parameter](int64 i, int64 j) {
-    // Do not merge if one of the sets includes live outs or entry parameters.
+    // Do not merge if one of the sets includes live outs, entry parameters or
+    // constants.
+    //
+    // Buffer liveness does not report the correct live range for entry
+    // parameter and live out buffers so we have to special case them here.  On
+    // backends that support constant buffer allocations, constant buffers are
+    // assigned globals in readonly storage so we can't merge colocated buffer
+    // sets containing constants with colocated buffer sets containing writing
+    // instructions or other constants.
+    //
+    // Moreover (on the CPU/GPU backends) the entry parameter buffers belong to
+    // the caller of the executable so we can't write to entry parameters
+    // either, and the argument for not merging constants also applies to entry
+    // parameters.
     for (int64 key : {i, j}) {
       for (auto& buffer : colocated_buffer_sets[key]) {
         if (buffer_liveness.MaybeLiveOut(*buffer) ||
-            is_entry_parameter(*buffer)) {
+            is_entry_parameter(*buffer) ||
+            buffer->instruction()->opcode() == HloOpcode::kConstant) {
           return true;
         }
       }
@@ -1566,6 +1589,7 @@ void BufferAssigner::AssignColocatedBufferSets(
     // param in 'colocated_buffer_set'.
     int64 entry_parameter_number = -1;
     const ShapeIndex* entry_parameter_shape_idx = nullptr;
+    bool is_constant = false;
     for (const LogicalBuffer* buffer : colocated_buffer_set) {
       const HloInstruction* instruction = buffer->instruction();
       const HloComputation* computation = instruction->parent();
@@ -1573,10 +1597,14 @@ void BufferAssigner::AssignColocatedBufferSets(
           computation == computation->parent()->entry_computation()) {
         entry_parameter_number = instruction->parameter_number();
         entry_parameter_shape_idx = &buffer->index();
-        break;
+      } else if (instruction->opcode() == HloOpcode::kConstant) {
+        is_constant = true;
       }
     }
 
+    CHECK(!is_constant || entry_parameter_number == -1)
+        << "Copy insertion should have inserted copies to prevent this.";
+
     for (const LogicalBuffer* buffer : colocated_buffer_set) {
       const int64 buffer_size = assignment->buffer_size_(*buffer);
       if (allocation == nullptr) {
@@ -1584,18 +1612,14 @@ void BufferAssigner::AssignColocatedBufferSets(
         // allocations for each colocated buffer set. When liveness has
         // module-level scope, we can allow buffers to be shared across
         // computations (in some cases).
-        allocation = assignment->NewAllocation(*buffer, buffer_size,
-                                               /*is_thread_local=*/false,
-                                               /*is_reusable=*/true);
+        allocation = assignment->NewAllocation(*buffer, buffer_size);
         if (entry_parameter_number >= 0) {
-          // This colocated buffer set contains an entry parameter and other
-          // logical buffers which use the parameter as read-only in a while
-          // body computation (which updates in place).
-          // Set 'entry_computation_parameter' to indicate that it contains
-          // an entry parameter, and to prevent reuse in MaybeAssignBuffer.
           allocation->set_entry_computation_parameter(
               entry_parameter_number, *entry_parameter_shape_idx);
         }
+        if (is_constant) {
+          allocation->set_constant(true);
+        }
         colocated_allocations->insert(allocation->index());
       } else {
         CHECK_EQ(buffer_size, allocation->size())
@@ -1653,7 +1677,7 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
       buffers_to_assign_sequentially;
   for (auto* computation : global_computations) {
     TF_RETURN_IF_ERROR(AssignBuffersForComputation(
-        computation, module->config().debug_options(),
+        computation,
         /*is_thread_local=*/false, colocated_buffers, colocated_allocations,
         &buffers_to_assign_sequentially, assignment.get()));
   }
@@ -1674,7 +1698,7 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::CreateAssignment(
       continue;
     }
     TF_RETURN_IF_ERROR(AssignBuffersForComputation(
-        computation, module->config().debug_options(),
+        computation,
         /*is_thread_local=*/true, colocated_buffers, colocated_allocations,
         /*buffers_to_assign_sequentially=*/nullptr, assignment.get()));
   }
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index ad0b0bf7c25d7194a06801e4ef1c9ee961f6b915..94495290c131e22392079dc2d0237d990b646d3e 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -32,7 +32,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
@@ -58,13 +57,8 @@ class BufferAllocation {
   // contiguously and can be used as array indexes.
   using Index = int64;
 
-  BufferAllocation(Index index, int64 size, bool is_thread_local,
-                   bool is_reusable, LogicalBuffer::Color color)
-      : index_(index),
-        size_(size),
-        is_thread_local_(is_thread_local),
-        is_reusable_(is_reusable),
-        color_(color) {}
+  BufferAllocation(Index index, int64 size, LogicalBuffer::Color color)
+      : index_(index), size_(size), color_(color) {}
   ~BufferAllocation() {}
 
   // Returns the index of this allocation.
@@ -74,9 +68,28 @@ class BufferAllocation {
   // inside of a map or reduce computation. Such allocations need to be thread
   // local.
   bool is_thread_local() const { return is_thread_local_; }
+  void set_is_thread_local(bool is_thread_local) {
+    is_thread_local_ = is_thread_local;
+  }
 
   // Whether this allocation can be used by more than one logical buffer.
-  bool is_reusable() const { return is_reusable_; }
+  bool is_reusable() const {
+    // We do not reuse thread-local buffers for now, because they are
+    // dynamically allocated and their lifetimes are hard to compute.
+    //
+    // TODO(b/34669761): Don't reuse tuple buffers because the GPU backend
+    // assumes longer buffer liveness than indicated by the analysis.
+    return !is_thread_local() && !is_tuple();
+  }
+
+  // Whether this allocation is readonly i.e. backed by memory we cannot write
+  // to.
+  bool is_readonly() const {
+    return is_entry_computation_parameter() || is_constant();
+  }
+
+  bool is_tuple() const { return is_tuple_; }
+  void set_is_tuple(bool is_tuple) { is_tuple_ = is_tuple; }
 
   // Whether this allocation holds a LogicalBuffer from a parameter of the entry
   // computation. These buffers have lifetimes which may be longer than the
@@ -84,6 +97,13 @@ class BufferAllocation {
   bool is_entry_computation_parameter() const {
     return is_entry_computation_parameter_;
   }
+
+  // Whether this allocation holds a constant.  On the CPU and GPU backends
+  // constant allocations are not allocated dynamically, instead we resolve
+  // references to these buffer allocations to a global in the readonly section
+  // of the binary.
+  bool is_constant() const { return is_constant_; }
+
   // If this allocation holds a Buffer from a parameter of the entry
   // computation, this methods returns the parameter number. CHECKs otherwise.
   int64 parameter_number() const {
@@ -189,7 +209,9 @@ class BufferAllocation {
            // of the computation.
            !maybe_live_out() &&
            // Thread-local buffers are allocated using `alloca`s.
-           !is_thread_local();
+           !is_thread_local() &&
+           // Constant buffers are allocated as global values.
+           !is_constant();
   }
 
   // Add a heap trace which was used to assign slices to logical buffers in this
@@ -245,6 +267,8 @@ class BufferAllocation {
     parameter_number_ = parameter_number;
     param_shape_index_ = std::move(param_shape_index);
   }
+
+  void set_constant(bool is_constant) { is_constant_ = is_constant; }
   void set_maybe_live_out(bool value) { maybe_live_out_ = value; }
   void set_index(Index index) { index_ = index; }
   void set_size(int64 size) { size_ = size; }
@@ -256,10 +280,10 @@ class BufferAllocation {
   int64 size_;
 
   // Whether this buffer needs to be thread-local.
-  bool is_thread_local_;
+  bool is_thread_local_ = false;
 
-  // Whether this buffer is usable by more than one logical buffer.
-  bool is_reusable_;
+  // Whether this buffer holds a tuple.
+  bool is_tuple_ = false;
 
   // Color of the allocation.
   LogicalBuffer::Color color_;
@@ -283,6 +307,9 @@ class BufferAllocation {
   // might not actually escape.
   bool maybe_live_out_ = false;
 
+  // See comment on the is_constant() accessor.
+  bool is_constant_ = false;
+
   // Mapping from the set of buffers assigned to this allocation to their
   // logical offsets and sizes.
   tensorflow::gtl::FlatMap<const LogicalBuffer*, OffsetSize> assigned_buffers_;
@@ -398,6 +425,8 @@ class BufferAssignment {
   struct Stats {
     int64 parameter_allocation_count = 0;
     int64 parameter_allocation_bytes = 0;
+    int64 constant_allocation_count = 0;
+    int64 constant_allocation_bytes = 0;
     int64 maybe_live_out_allocation_count = 0;
     int64 maybe_live_out_allocation_bytes = 0;
     int64 preallocated_temp_allocation_count = 0;
@@ -426,14 +455,11 @@ class BufferAssignment {
 
   // Creates and returns a new BufferAllocation, with no assigned
   // LogicalBuffers. Ownership is maintained internally.
-  BufferAllocation* NewEmptyAllocation(int64 size, bool is_thread_local,
-                                       bool is_reusable,
-                                       LogicalBuffer::Color color);
+  BufferAllocation* NewEmptyAllocation(int64 size, LogicalBuffer::Color color);
 
   // Helper that calls NewEmptyAllocation and AddAssignment in one call,
   // creating an allocation containing a single LogicalBuffer.
-  BufferAllocation* NewAllocation(const LogicalBuffer& buffer, int64 size,
-                                  bool is_thread_local, bool is_reusable);
+  BufferAllocation* NewAllocation(const LogicalBuffer& buffer, int64 size);
 
   // Adds a LogicalBuffer to the set assigned to the given allocation.
   void AddAssignment(BufferAllocation* allocation, const LogicalBuffer& buffer,
@@ -493,12 +519,15 @@ class BufferAssigner {
       LogicalBuffer::SizeFunction buffer_size,
       LogicalBuffer::AlignmentFunction color_alignment,
       bool allow_input_output_aliasing = false,
+      bool allocate_buffers_for_constants = false,
       BufferLiveness::Colorer colorer = BufferLiveness::DefaultColorer());
 
  private:
   BufferAssigner(bool allow_input_output_aliasing,
+                 bool allocate_buffers_for_constants,
                  BufferLiveness::Colorer colorer)
       : allow_input_output_aliasing_(allow_input_output_aliasing),
+        allocate_buffers_for_constants_(allocate_buffers_for_constants),
         colorer_(colorer) {}
   virtual ~BufferAssigner() = default;
 
@@ -513,8 +542,7 @@ class BufferAssigner {
   // true, then all assigned buffers have the is_thread_local flag set to
   // true.
   Status AssignBuffersForComputation(
-      const HloComputation* computation, const DebugOptions& debug_options,
-      bool is_thread_local,
+      const HloComputation* computation, bool is_thread_local,
       const tensorflow::gtl::FlatSet<const LogicalBuffer*>& colocated_buffers,
       const tensorflow::gtl::FlatSet<BufferAllocation::Index>&
           colocated_allocations,
@@ -595,6 +623,9 @@ class BufferAssigner {
   // buffers can be shared if their sizes match.
   bool allow_input_output_aliasing_;
 
+  // If true, allocate buffers for constant instructions.
+  bool allocate_buffers_for_constants_;
+
   // Functor used to assign colors to newly allocated logical buffers.
   BufferLiveness::Colorer colorer_;
 
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index bfd20921e29126fac8d0330ae7d50a0f7edd65b5..eccb146a0d7d628870be179a540d9750df3fe41c 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -89,7 +89,20 @@ class BufferAssignmentTest : public HloTestBase {
     return BufferAssigner::Run(
                module, xla::MakeUnique<DependencyHloOrdering>(module),
                backend().compiler()->BufferSizeBytesFunction(),
-               [alignment](LogicalBuffer::Color) { return alignment; })
+               [alignment](LogicalBuffer::Color) { return alignment; },
+               /*allow_input_output_aliasing=*/false,
+               /*allocate_buffers_for_constants=*/true)
+        .ConsumeValueOrDie();
+  }
+
+  std::unique_ptr<BufferAssignment> RunBufferAssignmentNoBuffersForConstants(
+      HloModule* module, int64 alignment = 1) {
+    return BufferAssigner::Run(
+               module, xla::MakeUnique<DependencyHloOrdering>(module),
+               backend().compiler()->BufferSizeBytesFunction(),
+               [alignment](LogicalBuffer::Color) { return alignment; },
+               /*allow_input_output_aliasing=*/false,
+               /*allocate_buffers_for_constants=*/false)
         .ConsumeValueOrDie();
   }
 
@@ -98,8 +111,9 @@ class BufferAssignmentTest : public HloTestBase {
     return BufferAssigner::Run(
                module, xla::MakeUnique<DependencyHloOrdering>(module),
                backend().compiler()->BufferSizeBytesFunction(),
-               [alignment](LogicalBuffer::Color) { return alignment; }, false,
-               std::move(colorer))
+               [alignment](LogicalBuffer::Color) { return alignment; },
+               /*allow_input_output_aliasing=*/false,
+               /*allocate_buffers_for_constants=*/true, std::move(colorer))
         .ConsumeValueOrDie();
   }
 
@@ -115,7 +129,9 @@ class BufferAssignmentTest : public HloTestBase {
                module,
                xla::MakeUnique<SequentialHloOrdering>(module, module_sequence),
                backend().compiler()->BufferSizeBytesFunction(),
-               [alignment](LogicalBuffer::Color) { return alignment; })
+               [alignment](LogicalBuffer::Color) { return alignment; },
+               /*allow_input_output_aliasing=*/false,
+               /*allocate_buffers_for_constants=*/true)
         .ConsumeValueOrDie();
   }
 
@@ -294,9 +310,15 @@ TEST_F(BufferAssignmentTest, ScalarConstant) {
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunBufferAssignment(module.get());
-  // Check that the constant does not have a buffer assigned.
-  EXPECT_FALSE(buffers->HasTopLevelAllocation(const0));
+  {
+    auto buffers = RunBufferAssignment(module.get());
+    EXPECT_TRUE(buffers->HasTopLevelAllocation(const0));
+  }
+
+  {
+    auto buffers = RunBufferAssignmentNoBuffersForConstants(module.get());
+    EXPECT_FALSE(buffers->HasTopLevelAllocation(const0));
+  }
 }
 
 TEST_F(BufferAssignmentTest, BufferForConst) {
@@ -312,12 +334,18 @@ TEST_F(BufferAssignmentTest, BufferForConst) {
   auto module = CreateNewModule();
   module->AddEntryComputation(builder.Build());
 
-  auto buffers = RunBufferAssignment(module.get());
-  // The two constant nodes have no buffers assigned.
-  EXPECT_FALSE(buffers->HasTopLevelAllocation(const0));
-  EXPECT_FALSE(buffers->HasTopLevelAllocation(const1));
-  // The add node has an output buffer.
-  GetAssignedOutputAllocation(*buffers, add);
+  {
+    auto buffers = RunBufferAssignment(module.get());
+    EXPECT_TRUE(buffers->HasTopLevelAllocation(const0));
+    EXPECT_TRUE(buffers->HasTopLevelAllocation(const1));
+    GetAssignedOutputAllocation(*buffers, add);
+  }
+  {
+    auto buffers = RunBufferAssignmentNoBuffersForConstants(module.get());
+    EXPECT_FALSE(buffers->HasTopLevelAllocation(const0));
+    EXPECT_FALSE(buffers->HasTopLevelAllocation(const1));
+    GetAssignedOutputAllocation(*buffers, add);
+  }
 }
 
 TEST_F(BufferAssignmentTest, HasAllocationAt) {
@@ -1196,7 +1224,7 @@ TEST_F(BufferAssignmentTest, ElementOfNestedTupleParameterAsOutput) {
 
 // TODO(b/32248867): Enable when buffer assignment gives allocations to
 // constants.
-TEST_F(BufferAssignmentTest, DISABLED_TupleConstantAsOutput) {
+TEST_F(BufferAssignmentTest, TupleConstantAsOutput) {
   // Test that a tuple constant which is forwarded to the computation output
   // is properly handled.
   auto builder = HloComputation::Builder(TestName());
@@ -1644,6 +1672,66 @@ TEST_F(BufferAssignmentTest, PeakBuffersWhile) {
       nonbcast_buffer->instruction() == condition->parameter_instruction(0));
 }
 
+TEST_F(BufferAssignmentTest, ConstantBuffersAreNotReused) {
+  const char* hlo_text = R"(
+HloModule Module
+
+True {
+  ROOT x.0.1 = f32[] parameter(0)
+}
+
+False {
+  x.0.0 = f32[] parameter(0)
+  ROOT copy.1 = f32[] copy(x.0.0)
+}
+
+ENTRY main {
+  pred.1.0 = pred[] parameter(0)
+  constant.1.1 = f32[] constant(56)
+  copy.2 = f32[] copy(constant.1.1)
+  constant.1.2 = f32[] constant(12)
+  ROOT conditional.1.3 = f32[] conditional(pred.1.0, copy.2, constant.1.2),
+      true_computation=True, false_computation=False
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_text));
+
+  HloInstruction* constant_1 =
+      module->entry_computation()->GetInstructionWithName("constant.1.1");
+  HloInstruction* constant_2 =
+      module->entry_computation()->GetInstructionWithName("constant.1.2");
+
+  auto buffers = RunBufferAssignment(module.get());
+
+  {
+    const BufferAllocation& allocation_for_const_1 =
+        GetTopLevelAllocation(*buffers, constant_1);
+    EXPECT_TRUE(allocation_for_const_1.is_constant());
+    for (const auto& buffer_offset_pair :
+         allocation_for_const_1.assigned_buffers()) {
+      EXPECT_NE(buffer_offset_pair.first->instruction()->opcode(),
+                HloOpcode::kCopy);
+      EXPECT_NE(buffer_offset_pair.first->instruction()->opcode(),
+                HloOpcode::kConditional);
+    }
+  }
+
+  {
+    const BufferAllocation& allocation_for_const_2 =
+        GetTopLevelAllocation(*buffers, constant_2);
+    EXPECT_TRUE(allocation_for_const_2.is_constant());
+    for (const auto& buffer_offset_pair :
+         allocation_for_const_2.assigned_buffers()) {
+      EXPECT_NE(buffer_offset_pair.first->instruction()->opcode(),
+                HloOpcode::kCopy);
+      EXPECT_NE(buffer_offset_pair.first->instruction()->opcode(),
+                HloOpcode::kConditional);
+    }
+  }
+}
+
 class WhileBufferAssignmentTest : public HloTestBase {
  protected:
   std::unique_ptr<HloComputation> BuildWhileConditionComputation(
@@ -1683,7 +1771,9 @@ class WhileBufferAssignmentTest : public HloTestBase {
     return BufferAssigner::Run(
                module, xla::MakeUnique<SequentialHloOrdering>(module, sequence),
                ByteSizeOf,
-               [alignment](LogicalBuffer::Color) { return alignment; })
+               [alignment](LogicalBuffer::Color) { return alignment; },
+               /*allow_input_output_aliasing=*/false,
+               /*allocate_buffers_for_constants=*/true)
         .ConsumeValueOrDie();
   }
 
@@ -1833,6 +1923,74 @@ ENTRY %test_module {
   EXPECT_NE(slice_param, slice_while1);
 }
 
+TEST_F(WhileBufferAssignmentTest, ColocatedBufferWithConstant) {
+  const Shape r0s32 = ShapeUtil::MakeShape(S32, {});
+
+  const char* module_str = R"(
+HloModule test_module
+
+%cond.v0 {
+  %param = s32[] parameter(0)
+  ROOT %constant = pred[] constant(true)
+}
+
+%cond.v1 {
+  %param.0 = s32[] parameter(0)
+  ROOT %constant.0 = pred[] constant(true)
+}
+
+%body.v0 {
+  ROOT %param.1 = s32[] parameter(0)
+}
+
+%body.v1 {
+  %param.2 = s32[] parameter(0)
+  ROOT add = s32[] add(%param.2, %param.2)
+}
+
+ENTRY %test_module {
+  %constant.42 = s32[] constant(42)
+  %while.0 = s32[] while(%constant.42), condition=%cond.v0, body=%body.v0
+  %mul = s32[] multiply(%while.0, %while.0)
+  %while.1 = s32[] while(%mul), condition=%cond.v1, body=%body.v1
+  ROOT %bcast = s32[1024,1024]{1,0} broadcast(s32[] %while.1), dimensions={}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  // Run CopyInsertion and check if the graph constructed above doesn't need
+  // any copies inserted for BufferAssignment to run.
+  int64 instruction_count = module->instruction_count();
+  CopyInsertion copy_insertion;
+  ASSERT_IS_OK(copy_insertion.Run(module.get()).status());
+  ASSERT_EQ(instruction_count, module->instruction_count());
+
+  // Get the instructions in the module.
+  const HloInstruction* bcast = module->entry_computation()->root_instruction();
+  const HloInstruction* constant =
+      module->entry_computation()->GetInstructionWithName("constant.42");
+  ASSERT_EQ(bcast->opcode(), HloOpcode::kBroadcast);
+  const HloInstruction* while1 = bcast->operand(0);
+  ASSERT_EQ(while1->opcode(), HloOpcode::kWhile);
+  const HloInstruction* while0 = while1->operand(0)->operand(0);
+  ASSERT_EQ(while0->opcode(), HloOpcode::kWhile);
+
+  // Run buffer assignment.
+  auto assignment = RunBufferAssignment(module.get());
+  TF_ASSERT_OK_AND_ASSIGN(auto slice_constant,
+                          assignment->GetUniqueSlice(constant, {}));
+  TF_ASSERT_OK_AND_ASSIGN(auto slice_while0,
+                          assignment->GetUniqueSlice(while0, {}));
+  TF_ASSERT_OK_AND_ASSIGN(auto slice_while1,
+                          assignment->GetUniqueSlice(while1, {}));
+
+  // The constant slice is part of the while0's colocation set (init value), but
+  // not merged into the while1's colocation set.
+  EXPECT_EQ(slice_constant, slice_while0);
+  EXPECT_NE(slice_constant, slice_while1);
+}
+
 // Tests that the colocated buffers for while instructions are properly assigned
 // during buffer assignment such that the result tuple elements are not assigned
 // to the same buffer.
@@ -1927,7 +2085,9 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
           module.get(),
           xla::MakeUnique<SequentialHloOrdering>(module.get(), sequence),
           backend().compiler()->BufferSizeBytesFunction(),
-          [](LogicalBuffer::Color) { return 1; }));
+          [](LogicalBuffer::Color) { return 1; },
+          /*allow_input_output_aliasing=*/false,
+          /*allocate_buffers_for_constants=*/true));
 
   // The result tuple elements must be assigned with different buffers.
   TF_ASSERT_OK_AND_ASSIGN(auto slice0, assignment->GetUniqueSlice(tuple, {0}));
@@ -2181,7 +2341,9 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
       BufferAssigner::Run(
           module.get(),
           xla::MakeUnique<SequentialHloOrdering>(module.get(), sequence),
-          ByteSizeOf, [](LogicalBuffer::Color) { return 1; })
+          ByteSizeOf, [](LogicalBuffer::Color) { return 1; },
+          /*allow_input_output_aliasing=*/false,
+          /*allocate_buffers_for_constants=*/true)
           .ConsumeValueOrDie();
 
   EXPECT_TRUE(BuffersDistinct({while0}, {while1}, *assignment));
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index ca2a78da67978152c997e2ad79a70530d04cc6d3..36fb9b43aa20bad788a0638b4fed6c88fc9023f0 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -351,26 +351,6 @@ Status StripControlDependenciesFrom(HloInstruction* instruction) {
   return Status::OK();
 }
 
-// Add kCopy instructions to the given module to guarantee there is no
-// live-range interference. Generally interference can only occur around kWhile
-// instructions which have update-in-place semantics.
-Status AddCopiesToResolveInterference(HloModule* module) {
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
-                      HloAliasAnalysis::Run(module));
-
-  for (HloComputation* computation : module->computations()) {
-    for (HloInstruction* instruction : computation->instructions()) {
-      if (instruction->opcode() == HloOpcode::kWhile) {
-        TF_RETURN_IF_ERROR(AddCopiesForWhile(*alias_analysis, instruction));
-      } else if (instruction->opcode() == HloOpcode::kConditional) {
-        TF_RETURN_IF_ERROR(
-            AddCopiesForConditional(*alias_analysis, instruction));
-      }
-    }
-  }
-  return Status::OK();
-}
-
 // Class for removing unnecessary copies from the module.
 //
 // kCopy instructions are added conservatively to guarantee no live range
@@ -945,6 +925,36 @@ class CopyRemover {
   BufferValueTracker buffer_value_tracker_;
 };
 
+void MaybeDumpModule(const string& message, const HloModule& module) {
+  if (VLOG_IS_ON(3)) {
+    VLOG(3) << message;
+    XLA_VLOG_LINES(3, module.ToString());
+    hlo_graph_dumper::MaybeDumpHloModule(module, message);
+  }
+}
+
+}  // namespace
+
+// Add kCopy instructions to the given module to guarantee there is no
+// live-range interference. Generally interference can only occur around kWhile
+// instructions which have update-in-place semantics.
+Status CopyInsertion::AddCopiesToResolveInterference(HloModule* module) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
+                      HloAliasAnalysis::Run(module, fusion_can_share_buffer_));
+
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* instruction : computation->instructions()) {
+      if (instruction->opcode() == HloOpcode::kWhile) {
+        TF_RETURN_IF_ERROR(AddCopiesForWhile(*alias_analysis, instruction));
+      } else if (instruction->opcode() == HloOpcode::kConditional) {
+        TF_RETURN_IF_ERROR(
+            AddCopiesForConditional(*alias_analysis, instruction));
+      }
+    }
+  }
+  return Status::OK();
+}
+
 // Add copies to address special constraints on the roots of computations not
 // related to live range interference:
 //
@@ -955,9 +965,10 @@ class CopyRemover {
 //
 //    (3) Constants and parameters cannot be live out of the entry computation
 //
-Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module) {
+Status CopyInsertion::AddSpecialCaseCopies(const CallGraph& call_graph,
+                                           HloModule* module) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
-                      HloAliasAnalysis::Run(module));
+                      HloAliasAnalysis::Run(module, fusion_can_share_buffer_));
 
   // Identify which shape indices of which instructions need to be copied. Store
   // these results in 'instructions_to_copy'.
@@ -1065,32 +1076,20 @@ Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module) {
   return Status::OK();
 }
 
-Status VerifyNoLiveRangeInterference(HloModule* module) {
+Status CopyInsertion::VerifyNoLiveRangeInterference(HloModule* module) {
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
-                      HloAliasAnalysis::Run(module));
+                      HloAliasAnalysis::Run(module, fusion_can_share_buffer_));
   DependencyHloOrdering ordering(module);
   TF_RET_CHECK(!alias_analysis->HasLiveRangeInterference(ordering));
   return Status::OK();
 }
 
-void MaybeDumpModule(const string& message, const HloModule& module) {
-  if (VLOG_IS_ON(3)) {
-    VLOG(3) << message;
-    XLA_VLOG_LINES(3, module.ToString());
-    hlo_graph_dumper::MaybeDumpHloModule(module, message);
-  }
-}
-
-}  // namespace
-
-Status RemoveUnnecessaryCopies(
-    const HloOrdering& ordering, HloModule* module,
-    const HloDataflowAnalysis::FusionCanShareBufferFunction&
-        fusion_can_share_buffer) {
+Status CopyInsertion::RemoveUnnecessaryCopies(const HloOrdering& ordering,
+                                              HloModule* module) {
   MaybeDumpModule("after adding copies to resolve interference", *module);
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloAliasAnalysis> alias_analysis,
-                      HloAliasAnalysis::Run(module, fusion_can_share_buffer));
+                      HloAliasAnalysis::Run(module, fusion_can_share_buffer_));
   CopyRemover copy_remover(*alias_analysis, ordering, module);
   XLA_VLOG_LINES(3, copy_remover.ToString());
 
diff --git a/tensorflow/compiler/xla/service/copy_insertion.h b/tensorflow/compiler/xla/service/copy_insertion.h
index e1973db928423cb4bbad00fe34329f731b23ea09..5ba64b78a3c9aff5f323691df2ece9b5e6bf3232 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.h
+++ b/tensorflow/compiler/xla/service/copy_insertion.h
@@ -71,20 +71,26 @@ class CopyInsertion : public HloPassInterface {
   // TODO(b/62548313): Remove this when buffer assignment is module-scoped.
   static StatusOr<bool> AddCopiesForBufferAssignment(HloModule* module);
 
+  // Try to remove as many copies from the module as possible without
+  // introducing live range interference. Only copy instructions that are
+  // eligible for copy elision are considered for removal.
+  Status RemoveUnnecessaryCopies(const HloOrdering& ordering,
+                                 HloModule* module);
+
  private:
+  // Verifies that no HLO values have interfering live ranged assuming the
+  // ordering used by copy insertion.
+  Status VerifyNoLiveRangeInterference(HloModule* module);
+
+  Status AddCopiesToResolveInterference(HloModule* module);
+
+  Status AddSpecialCaseCopies(const CallGraph& call_graph, HloModule* module);
+
   // Backend specific function that decides whether a fusion can share buffer
   // with its operand.
   HloDataflowAnalysis::FusionCanShareBufferFunction fusion_can_share_buffer_;
 };
 
-// Try to remove as many copies from the module as possible without introducing
-// live range interference. Only copy instructions that are eligible for
-// copy elision are considered for removal.
-Status RemoveUnnecessaryCopies(
-    const HloOrdering& ordering, HloModule* module,
-    const HloDataflowAnalysis::FusionCanShareBufferFunction&
-        fusion_can_share_buffer = nullptr);
-
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_COPY_INSERTION_H_
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index ace9f96cfb70e487476dd42b14fa241b70f80634..504b61d134a0099d055d0266408e1dfb94af5b2a 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -252,6 +252,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_module_config",
         "//tensorflow/compiler/xla/service:name_uniquer",
         "//tensorflow/compiler/xla/service/llvm_ir:alias_analysis",
+        "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util",
         "//tensorflow/compiler/xla/service/llvm_ir:dynamic_update_slice_util",
         "//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter",
         "//tensorflow/compiler/xla/service/llvm_ir:ir_array",
@@ -363,8 +364,8 @@ tf_cc_binary(
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/core:lib",
     ],
 )
@@ -444,6 +445,7 @@ cc_library(
     deps = [
         ":vector_support_library",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/compiler/xla/service/llvm_ir:math_ops",
         "//tensorflow/core:lib",
         "@llvm//:core",
         "@llvm//:transform_utils",
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 29fa29d33ad62a76191cef2de22ccc094b0cf35b..b49ea898962e437ec80dca0deec3aba70556b0dd 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -562,7 +562,9 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
       BufferAssigner::Run(
           module.get(),
           xla::MakeUnique<SequentialHloOrdering>(module.get(), module_sequence),
-          BufferSizeBytesFunction(), memory_alignment));
+          BufferSizeBytesFunction(), memory_alignment,
+          /*allow_input_output_aliasing=*/false,
+          /*allocate_buffers_for_constants=*/true));
   // BufferAssignment::ToString() includes a header, so no need for us to
   // print one ourselves.
   XLA_VLOG_LINES(2, assignment->ToString());
@@ -584,6 +586,8 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
                        std::move(computation_to_profile_idx),
                        &target_machine_features);
 
+  TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals());
+
   for (auto embedded_computation :
        entry_computation->MakeEmbeddedComputationsList()) {
     if (embedded_computation->IsFusionComputation()) {
@@ -747,7 +751,9 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
         BufferAssigner::Run(
             module,
             xla::MakeUnique<SequentialHloOrdering>(module, module_sequence),
-            BufferSizeBytesFunction(), memory_alignment));
+            BufferSizeBytesFunction(), memory_alignment,
+            /*allow_input_output_aliasing=*/false,
+            /*allocate_buffers_for_constants=*/true));
     // BufferAssignment::ToString() includes a header, so no need for us to
     // print one ourselves.
     XLA_VLOG_LINES(2, assignment->ToString());
@@ -776,6 +782,9 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
                          std::move(instruction_to_profile_idx),
                          std::move(computation_to_profile_idx),
                          &target_machine_features);
+
+    TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals());
+
     HloComputation* computation = module->entry_computation();
     for (auto embedded_computation :
          computation->MakeEmbeddedComputationsList()) {
@@ -832,7 +841,8 @@ CpuCompiler::CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> modules,
     BufferSizes buffer_sizes;
     for (const BufferAllocation& allocation : assignment->Allocations()) {
       // Callers don't need to allocate temporary buffers for parameters.
-      if (allocation.is_entry_computation_parameter()) {
+      if (allocation.is_entry_computation_parameter() ||
+          allocation.is_constant()) {
         buffer_sizes.push_back(-1);
         continue;
       }
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 1093559892ddb9c238fd9c1f7e3d419ec7022776..81e17a5cd4de7151217ba0f2710c49546bce1f10 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -88,6 +88,11 @@ Status CpuExecutable::AllocateBuffers(
       continue;
     }
 
+    if (allocation.is_constant()) {
+      VLOG(3) << "allocation #" << i << " is a constant";
+      continue;
+    }
+
     if (allocation.is_thread_local()) {
       VLOG(3) << "buffer #" << i << " is thread-local";
       continue;
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 58228180ca55ede50c8579bbd73cfdfffc07e208..645888de783e4025cffd6fa4835e60b84bbd7d99 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -49,15 +49,15 @@ class MemoryTile {
   // `tile_size_along_major_dim` vectors from the matrix `matrix`, starting at
   // `major_dim_offset` in the major dimension.  The tile size along the minor
   // dimension is the vector size, and that is implicitly determined by `vsl`.
-  MemoryTile(VectorSupportLibrary* vsl, llvm::IRBuilder<>* ir_builder,
+  MemoryTile(VectorSupportLibrary* vsl, llvm::IRBuilder<>* b,
              llvm::Value* matrix, int64 matrix_size_along_minor_dim,
              llvm::Value* major_dim_offset, int64 tile_size_along_major_dim)
-      : vsl_(vsl), ir_builder_(ir_builder) {
+      : vsl_(vsl), b_(b) {
     pointers_.reserve(tile_size_along_major_dim);
     for (int64 i = 0; i < tile_size_along_major_dim; i++) {
-      llvm::Value* total_offset = ir_builder->CreateMul(
-          ir_builder->getInt64(matrix_size_along_minor_dim),
-          ir_builder->CreateAdd(ir_builder->getInt64(i), major_dim_offset));
+      llvm::Value* total_offset =
+          b->CreateMul(b->getInt64(matrix_size_along_minor_dim),
+                       b->CreateAdd(b->getInt64(i), major_dim_offset));
       pointers_.push_back(vsl_->ComputeOffsetPointer(matrix, total_offset));
     }
   }
@@ -101,8 +101,7 @@ class MemoryTile {
     for (int64 i = 0; i < pointers_.size(); i++) {
       for (int64 j = 0; j < tile_size_along_middle_dim; j++) {
         result[i].push_back(vsl_->LoadBroadcast(
-            pointers_[i], ir_builder_->CreateAdd(minor_dim_offset,
-                                                 ir_builder_->getInt64(j))));
+            pointers_[i], b_->CreateAdd(minor_dim_offset, b_->getInt64(j))));
       }
     }
     return result;
@@ -110,7 +109,7 @@ class MemoryTile {
 
  private:
   VectorSupportLibrary* vsl_;
-  llvm::IRBuilder<>* ir_builder_;
+  llvm::IRBuilder<>* b_;
   std::vector<llvm::Value*> pointers_;
 };
 
@@ -249,16 +248,15 @@ class ColumnMajorMatrixVectorProductEmitter
   ColumnMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs,
                                         llvm::Value* rhs, llvm::Value* addend,
                                         llvm::Value* result,
-                                        llvm::IRBuilder<>* ir_builder)
+                                        llvm::IRBuilder<>* b)
       : config_(config),
         lhs_(lhs),
         rhs_(rhs),
         addend_(addend),
         result_(result),
-        ir_builder_(ir_builder),
-        ksl_(ir_builder_),
-        vsl_(config.scalar_type(), /*vector_size=*/config.tile_rows(),
-             ir_builder_, "") {
+        b_(b),
+        ksl_(b_),
+        vsl_(config.scalar_type(), /*vector_size=*/config.tile_rows(), b_, "") {
     CHECK(tile_rows() > 0 && IsPowerOfTwo(static_cast<uint64>(tile_rows())));
     CHECK(!has_addend() || addend != nullptr);
   }
@@ -272,7 +270,7 @@ class ColumnMajorMatrixVectorProductEmitter
                          bool is_first_column);
 
   MemoryTile GetLhsMemoryTile(llvm::Value* column_start, int64 column_count) {
-    return MemoryTile(&vsl_, ir_builder_, /*matrix=*/lhs_,
+    return MemoryTile(&vsl_, b_, /*matrix=*/lhs_,
                       /*matrix_size_along_minor_dim=*/m(),
                       /*major_dim_offset=*/column_start,
                       /*tile_size_along_major_dim=*/column_count);
@@ -302,7 +300,7 @@ class ColumnMajorMatrixVectorProductEmitter
   llvm::Value* rhs_;
   llvm::Value* addend_;
   llvm::Value* result_;
-  llvm::IRBuilder<>* ir_builder_;
+  llvm::IRBuilder<>* b_;
   KernelSupportLibrary ksl_;
   VectorSupportLibrary vsl_;
 };
@@ -331,7 +329,7 @@ void ColumnMajorMatrixVectorProductEmitter::Emit() {
                      });
 
   if (column_remainder != 0) {
-    EmitOuterLoopBody(ir_builder_->getInt64(column_limit), column_remainder,
+    EmitOuterLoopBody(b_->getInt64(column_limit), column_remainder,
                       column_limit == 0);
   }
 }
@@ -364,7 +362,7 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
     return;
   }
 
-  llvm::Value* columns_llvm = ir_builder_->getInt64(columns);
+  llvm::Value* columns_llvm = b_->getInt64(columns);
 
   // for (col = current_tile_col; col < (columns + current_tile_col); col++)
   //   for (row = row_start, row < m_; row++) {
@@ -375,12 +373,11 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
 
   ksl_.ForReturnVoid(
       "dot.inner.epilg.outer", /*start=*/current_tile_col,
-      /*end=*/ir_builder_->CreateAdd(columns_llvm, current_tile_col),
+      /*end=*/b_->CreateAdd(columns_llvm, current_tile_col),
       /*step=*/1, /*peel_first_iteration=*/false,
       [&](llvm::Value* col, llvm::Value* is_first_scalar_col) {
         llvm::Value* rhs_element = vsl_.LoadScalar(rhs_, col);
-        llvm::Value* total_offset =
-            ir_builder_->CreateMul(col, ir_builder_->getInt64(m()));
+        llvm::Value* total_offset = b_->CreateMul(col, b_->getInt64(m()));
         llvm::Value* lhs_base_pointer =
             vsl_.ComputeOffsetPointer(lhs_, total_offset);
         ksl_.ForReturnVoid(
@@ -388,9 +385,8 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
             /*step=*/1, [&](llvm::Value* scalar_row) {
               llvm::Value* product = vsl_.Mul(
                   vsl_.LoadScalar(lhs_base_pointer, scalar_row), rhs_element);
-              llvm::Value* setting_result_first_time = ir_builder_->CreateAnd(
-                  is_first_scalar_col,
-                  ir_builder_->getInt1(is_first_tiled_column));
+              llvm::Value* setting_result_first_time = b_->CreateAnd(
+                  is_first_scalar_col, b_->getInt1(is_first_tiled_column));
               ksl_.IfReturnVoid(
                   setting_result_first_time,
                   /*true_block_generator=*/
@@ -478,16 +474,15 @@ class RowMajorMatrixVectorProductEmitter
 
   RowMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs,
                                      llvm::Value* rhs, llvm::Value* addend,
-                                     llvm::Value* result,
-                                     llvm::IRBuilder<>* ir_builder)
+                                     llvm::Value* result, llvm::IRBuilder<>* b)
       : config_(config),
         lhs_(lhs),
         rhs_(rhs),
         addend_(addend),
         result_(result),
-        ir_builder_(ir_builder),
-        ksl_(ir_builder_),
-        vsl_(scalar_type(), /*vector_size=*/tile_cols(), ir_builder_, "") {
+        b_(b),
+        ksl_(b_),
+        vsl_(scalar_type(), /*vector_size=*/tile_cols(), b_, "") {
     CHECK(tile_cols() > 0 && IsPowerOfTwo(static_cast<uint64>(tile_cols())));
     CHECK(!has_addend() || addend != nullptr);
   }
@@ -498,7 +493,7 @@ class RowMajorMatrixVectorProductEmitter
 
  private:
   MemoryTile GetLhsMemoryTile(llvm::Value* row_start, int64 row_count) {
-    return MemoryTile(&vsl_, ir_builder_, /*matrix=*/lhs_,
+    return MemoryTile(&vsl_, b_, /*matrix=*/lhs_,
                       /*matrix_size_along_minor_dim=*/k(),
                       /*major_dim_offset=*/row_start,
                       /*tile_size_along_major_dim=*/row_count);
@@ -517,7 +512,7 @@ class RowMajorMatrixVectorProductEmitter
   llvm::Value* rhs_;
   llvm::Value* addend_;
   llvm::Value* result_;
-  llvm::IRBuilder<>* ir_builder_;
+  llvm::IRBuilder<>* b_;
   KernelSupportLibrary ksl_;
   VectorSupportLibrary vsl_;
 };
@@ -559,7 +554,7 @@ void RowMajorMatrixVectorProductEmitter::EmitOuterLoopBody(llvm::Value* row,
   for (int i = 0; i < row_count; i++) {
     llvm::Value* result_value =
         vsl_.Add(horizontal_sums[i], scalar_accumulators[i].Get());
-    llvm::Value* offset = ir_builder_->CreateAdd(ir_builder_->getInt64(i), row);
+    llvm::Value* offset = b_->CreateAdd(b_->getInt64(i), row);
     if (addend_ && row_count != vsl_.vector_size()) {
       result_value = vsl_.Add(vsl_.LoadScalar(addend_, offset), result_value);
     }
@@ -578,7 +573,7 @@ void RowMajorMatrixVectorProductEmitter::Emit() {
       [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows()); });
 
   if (row_remainder != 0) {
-    EmitOuterLoopBody(ir_builder_->getInt64(row_limit), row_remainder);
+    EmitOuterLoopBody(b_->getInt64(row_limit), row_remainder);
   }
 }
 
@@ -609,9 +604,8 @@ void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
   }
 
   for (int r = 0; r < rows; r++) {
-    llvm::Value* total_offset = ir_builder_->CreateMul(
-        ir_builder_->CreateAdd(ir_builder_->getInt64(r), current_tile_row),
-        ir_builder_->getInt64(k()));
+    llvm::Value* total_offset = b_->CreateMul(
+        b_->CreateAdd(b_->getInt64(r), current_tile_row), b_->getInt64(k()));
     llvm::Value* lhs_base_pointer =
         vsl_.ComputeOffsetPointer(lhs_, total_offset);
     ksl_.ForReturnVoid(
@@ -722,13 +716,13 @@ class MatrixMatrixBlockPanelEmitter {
   // `lhs` with `rhs` and stores the result in `result`.
   explicit MatrixMatrixBlockPanelEmitter(Config config, llvm::Value* lhs,
                                          llvm::Value* rhs, llvm::Value* result,
-                                         llvm::IRBuilder<>* ir_builder)
+                                         llvm::IRBuilder<>* b)
       : lhs_(lhs),
         rhs_(rhs),
         result_(result),
         config_(config),
-        ir_builder_(ir_builder),
-        ksl_(ir_builder_) {
+        b_(b),
+        ksl_(b_) {
     CHECK(max_vectorization_width() > 0 &&
           IsPowerOfTwo(static_cast<uint64>(max_vectorization_width())));
     CHECK_GT(max_vector_count(), 0);
@@ -761,7 +755,7 @@ class MatrixMatrixBlockPanelEmitter {
                      int64 tile_size_m, llvm::Value* m_start,
                      llvm::Value* m_end);
 
-  llvm::Value* GetInt64(int64 value) { return ir_builder_->getInt64(value); }
+  llvm::Value* GetInt64(int64 value) { return b_->getInt64(value); }
 
   Config config() const { return config_; }
   Dimensions dims() const { return config().dims(); }
@@ -782,7 +776,7 @@ class MatrixMatrixBlockPanelEmitter {
   llvm::Value* result_;
   Config config_;
 
-  llvm::IRBuilder<>* ir_builder_;
+  llvm::IRBuilder<>* b_;
   KernelSupportLibrary ksl_;
 };
 
@@ -804,8 +798,8 @@ void MatrixMatrixBlockPanelEmitter::HandleResiduesOnN() {
          current_vectorization_width >= min_vectorization_width()) {
     int64 n_end = dims().n() - (dims().n() % current_vectorization_width);
     if (n_start != n_end) {
-      VectorSupportLibrary vsl(scalar_type(), current_vectorization_width,
-                               ir_builder_, "gebp");
+      VectorSupportLibrary vsl(scalar_type(), current_vectorization_width, b_,
+                               "gebp");
       HandleResiduesOnK(&vsl, GetInt64(n_start), GetInt64(n_end));
       n_start = n_end;
     }
@@ -819,10 +813,9 @@ void MatrixMatrixBlockPanelEmitter::HandleResiduesOnN() {
   }
 
   if (n_start != dims().n()) {
-    VectorSupportLibrary vsl(scalar_type(), 1, ir_builder_, "gebp");
+    VectorSupportLibrary vsl(scalar_type(), 1, b_, "gebp");
     ksl_.ForReturnVoid("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) {
-      llvm::Value* n_i_next =
-          ir_builder_->CreateAdd(n_i, ir_builder_->getInt64(1));
+      llvm::Value* n_i_next = b_->CreateAdd(n_i, b_->getInt64(1));
       HandleResiduesOnK(&vsl, n_i, n_i_next);
     });
   }
@@ -935,11 +928,11 @@ void MatrixMatrixBlockPanelEmitter::EmitTiledGemm(
   ksl_.ForReturnVoid(
       "dot.m", m_start, m_end, tile_size_m, [&](llvm::Value* m_i) {
         MemoryTile result_memory_tile(
-            vsl, ir_builder_, /*matrix=*/result_,
+            vsl, b_, /*matrix=*/result_,
             /*matrix_size_along_minor_dim=*/dims().n(),
             /*major_dim_offset=*/m_i,
             /*tile_size_along_major_dim=*/tile_size_m);
-        MemoryTile lhs_memory_tile(vsl, ir_builder_, /*matrix=*/lhs_,
+        MemoryTile lhs_memory_tile(vsl, b_, /*matrix=*/lhs_,
                                    /*matrix_size_along_minor_dim=*/dims().k(),
                                    /*major_dim_offset=*/m_i,
                                    /*tile_size_along_major_dim=*/tile_size_m);
@@ -949,8 +942,8 @@ void MatrixMatrixBlockPanelEmitter::EmitTiledGemm(
                                            result_memory_tile.LoadTile(n_i));
               ksl_.ForReturnVoid(
                   "dot.k", k_start, k_end, tile_size_k, [&](llvm::Value* k_i) {
-                    MemoryTile rhs_memory_tile(vsl, ir_builder_, rhs_,
-                                               dims().n(), k_i, tile_size_k);
+                    MemoryTile rhs_memory_tile(vsl, b_, rhs_, dims().n(), k_i,
+                                               tile_size_k);
                     std::vector<std::vector<llvm::Value*>> lhs_tile =
                         lhs_memory_tile.LoadBroadcastTile(k_i, tile_size_k);
                     std::vector<llvm::Value*> rhs_tile =
@@ -980,7 +973,7 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
                            const llvm_ir::IrArray& rhs_array,
                            const llvm_ir::IrArray* addend_array,
                            llvm::Value* executable_run_options_value,
-                           llvm::IRBuilder<>* ir_builder,
+                           llvm::IRBuilder<>* b,
                            const HloModuleConfig& hlo_module_config,
                            const TargetMachineFeatures& target_machine_features)
     : dot_(dot),
@@ -989,7 +982,7 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
       rhs_array_(rhs_array),
       addend_array_(addend_array),
       executable_run_options_value_(executable_run_options_value),
-      ir_builder_(ir_builder),
+      b_(b),
       hlo_module_config_(hlo_module_config),
       target_machine_features_(target_machine_features) {}
 
@@ -997,15 +990,14 @@ DotOpEmitter::DotOpEmitter(const HloInstruction& dot,
     const HloInstruction& dot, const llvm_ir::IrArray& target_array,
     const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array,
     const llvm_ir::IrArray* addend_array,
-    llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder,
+    llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
     const HloModuleConfig& hlo_module_config,
     const TargetMachineFeatures& target_machine_features) {
   PrimitiveType type = target_array.GetShape().element_type();
   TF_RET_CHECK(F16 == type || F32 == type || F64 == type || C64 == type);
   DotOpEmitter dot_emitter(dot, target_array, lhs_array, rhs_array,
-                           addend_array, executable_run_options_value,
-                           ir_builder, hlo_module_config,
-                           target_machine_features);
+                           addend_array, executable_run_options_value, b,
+                           hlo_module_config, target_machine_features);
   return dot_emitter.Emit();
 }
 
@@ -1050,13 +1042,13 @@ bool DotOpEmitter::EmitExperimentalGebpDotIfEnabled(
   }
 
   int64 size_bytes = m * n * ShapeUtil::ByteSizeOfPrimitiveType(primitive_type);
-  ir_builder_->CreateMemSet(
-      target, ir_builder_->getInt8(0), size_bytes,
+  b_->CreateMemSet(
+      target, b_->getInt8(0), size_bytes,
       target_machine_features_.minimum_alignment_for_allocation(size_bytes));
 
   int64 max_target_vector_width =
       target_machine_features_.vector_register_num_elements(
-          *ir_builder_->GetInsertBlock()->getParent(), primitive_type);
+          *b_->GetInsertBlock()->getParent(), primitive_type);
 
   int64 tile_size_m, tile_size_k, tile_size_n_in_vector_width;
   std::tie(tile_size_m, tile_size_k, tile_size_n_in_vector_width) =
@@ -1080,12 +1072,12 @@ bool DotOpEmitter::EmitExperimentalGebpDotIfEnabled(
 
   KernelSupportLibrary::EmitAndCallOutlinedKernel(
       /*enable_fast_math=*/enable_fast_math,
-      /*optimize_for_size=*/optimize_for_size, ir_builder_,
-      config.GetCacheKey(), lhs, rhs, target,
+      /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(), lhs,
+      rhs, target,
       [this, config](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* target) {
-        MatrixMatrixBlockPanelEmitter gebp_emitter(
-            config, /*lhs=*/lhs, /*rhs=*/rhs,
-            /*result=*/target, ir_builder_);
+        MatrixMatrixBlockPanelEmitter gebp_emitter(config, /*lhs=*/lhs,
+                                                   /*rhs=*/rhs,
+                                                   /*result=*/target, b_);
         gebp_emitter.Emit();
       });
 
@@ -1163,7 +1155,7 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
 
   const int target_vector_register_element_size =
       target_machine_features_.vector_register_num_elements(
-          *ir_builder_->GetInsertBlock()->getParent(), primitive_type);
+          *b_->GetInsertBlock()->getParent(), primitive_type);
 
   // We may not always know the vector register size for the target we're
   // compiling against, in which case target_vector_register_element_size is 0.
@@ -1184,13 +1176,13 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
 
     KernelSupportLibrary::EmitAndCallOutlinedKernel(
         /*enable_fast_math=*/enable_fast_math,
-        /*optimize_for_size=*/optimize_for_size, ir_builder_,
-        config.GetCacheKey(), lhs_op, rhs_op,
+        /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(),
+        lhs_op, rhs_op,
         addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op,
         [this, config](llvm::Value* lhs_op, llvm::Value* rhs_op,
                        llvm::Value* addend_op, llvm::Value* result_op) {
           ColumnMajorMatrixVectorProductEmitter emitter(
-              config, lhs_op, rhs_op, addend_op, result_op, ir_builder_);
+              config, lhs_op, rhs_op, addend_op, result_op, b_);
           emitter.Emit();
         });
   } else {
@@ -1203,13 +1195,13 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() {
 
     KernelSupportLibrary::EmitAndCallOutlinedKernel(
         /*enable_fast_math=*/enable_fast_math,
-        /*optimize_for_size=*/optimize_for_size, ir_builder_,
-        config.GetCacheKey(), lhs_op, rhs_op,
+        /*optimize_for_size=*/optimize_for_size, b_, config.GetCacheKey(),
+        lhs_op, rhs_op,
         addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op,
         [this, config](llvm::Value* lhs_op, llvm::Value* rhs_op,
                        llvm::Value* addend_op, llvm::Value* result_op) {
-          RowMajorMatrixVectorProductEmitter emitter(
-              config, lhs_op, rhs_op, addend_op, result_op, ir_builder_);
+          RowMajorMatrixVectorProductEmitter emitter(config, lhs_op, rhs_op,
+                                                     addend_op, result_op, b_);
           emitter.Emit();
         });
   }
@@ -1285,11 +1277,11 @@ Status DotOpEmitter::Emit() {
   // Create loop nests which loop through the LHS operand dimensions and the RHS
   // operand dimensions. The reduction dimension of the LHS and RHS are handled
   // in a separate innermost loop which performs the sum of products.
-  llvm_ir::ForLoopNest loop_nest(llvm_ir::IrName(&dot_), ir_builder_);
-  llvm_ir::IrArray::Index lhs_index = EmitOperandArrayLoopNest(
-      &loop_nest, lhs_array_, lhs_reduction_dimension, "lhs");
-  llvm_ir::IrArray::Index rhs_index = EmitOperandArrayLoopNest(
-      &loop_nest, rhs_array_, rhs_reduction_dimension, "rhs");
+  llvm_ir::ForLoopNest loop_nest(llvm_ir::IrName(&dot_), b_);
+  llvm_ir::IrArray::Index lhs_index = loop_nest.EmitOperandArrayLoopNest(
+      lhs_array_, /*dimension_to_skip=*/lhs_reduction_dimension, "lhs");
+  llvm_ir::IrArray::Index rhs_index = loop_nest.EmitOperandArrayLoopNest(
+      rhs_array_, /*dimension_to_skip=*/rhs_reduction_dimension, "rhs");
 
   // Create the loop which does the sum of products reduction.
   //
@@ -1319,62 +1311,55 @@ Status DotOpEmitter::Emit() {
   // Function entry basic block.
   // - Emit alloca for accumulator
   llvm::Function* func = reduction_loop->GetPreheaderBasicBlock()->getParent();
-  SetToFirstInsertPoint(&func->getEntryBlock(), ir_builder_);
+  SetToFirstInsertPoint(&func->getEntryBlock(), b_);
   llvm::Type* accum_type = target_array_.GetElementLlvmType();
-  llvm::Value* accum_address = ir_builder_->CreateAlloca(
-      accum_type, /*ArraySize=*/nullptr, "accum_address");
+  llvm::Value* accum_address =
+      b_->CreateAlloca(accum_type, /*ArraySize=*/nullptr, "accum_address");
 
   // Preheader basic block of reduction loop:
   // - Initialize accumulator to zero.
   llvm::BasicBlock* preheader_bb = reduction_loop->GetPreheaderBasicBlock();
-  ir_builder_->SetInsertPoint(preheader_bb->getTerminator());
+  b_->SetInsertPoint(preheader_bb->getTerminator());
 
-  ir_builder_->CreateStore(llvm::Constant::getNullValue(accum_type),
-                           accum_address);
+  b_->CreateStore(llvm::Constant::getNullValue(accum_type), accum_address);
 
   // Body basic block of reduction loop:
   // - Load elements from lhs and rhs array.
   // - Multiply lhs-element and rhs-element.
   // - Load accumulator and add to product.
   // - Store sum back into accumulator.
-  SetToFirstInsertPoint(reduction_loop->GetBodyBasicBlock(), ir_builder_);
+  SetToFirstInsertPoint(reduction_loop->GetBodyBasicBlock(), b_);
 
-  llvm::Value* lhs_element =
-      lhs_array_.EmitReadArrayElement(lhs_index, ir_builder_);
-  llvm::Value* rhs_element =
-      rhs_array_.EmitReadArrayElement(rhs_index, ir_builder_);
+  llvm::Value* lhs_element = lhs_array_.EmitReadArrayElement(lhs_index, b_);
+  llvm::Value* rhs_element = rhs_array_.EmitReadArrayElement(rhs_index, b_);
 
-  llvm::Value* accum = ir_builder_->CreateLoad(accum_address);
+  llvm::Value* accum = b_->CreateLoad(accum_address);
   llvm::Value* updated_accum;
   if (ShapeUtil::ElementIsComplex(lhs_shape)) {
-    auto real = [&](llvm::Value* x) {
-      return ir_builder_->CreateExtractValue(x, {0});
-    };
-    auto imag = [&](llvm::Value* x) {
-      return ir_builder_->CreateExtractValue(x, {1});
-    };
-    llvm::Value* product_real = ir_builder_->CreateFSub(
-        ir_builder_->CreateFMul(real(lhs_element), real(rhs_element)),
-        ir_builder_->CreateFMul(imag(lhs_element), imag(rhs_element)));
-    llvm::Value* product_imag = ir_builder_->CreateFAdd(
-        ir_builder_->CreateFMul(real(lhs_element), imag(rhs_element)),
-        ir_builder_->CreateFMul(imag(lhs_element), real(rhs_element)));
-    updated_accum = ir_builder_->CreateInsertValue(
-        accum, ir_builder_->CreateFAdd(real(accum), product_real), {0});
-    updated_accum = ir_builder_->CreateInsertValue(
-        updated_accum, ir_builder_->CreateFAdd(imag(accum), product_imag), {1});
+    auto real = [&](llvm::Value* x) { return b_->CreateExtractValue(x, {0}); };
+    auto imag = [&](llvm::Value* x) { return b_->CreateExtractValue(x, {1}); };
+    llvm::Value* product_real =
+        b_->CreateFSub(b_->CreateFMul(real(lhs_element), real(rhs_element)),
+                       b_->CreateFMul(imag(lhs_element), imag(rhs_element)));
+    llvm::Value* product_imag =
+        b_->CreateFAdd(b_->CreateFMul(real(lhs_element), imag(rhs_element)),
+                       b_->CreateFMul(imag(lhs_element), real(rhs_element)));
+    updated_accum = b_->CreateInsertValue(
+        accum, b_->CreateFAdd(real(accum), product_real), {0});
+    updated_accum = b_->CreateInsertValue(
+        updated_accum, b_->CreateFAdd(imag(accum), product_imag), {1});
   } else {
-    llvm::Value* product = ir_builder_->CreateFMul(lhs_element, rhs_element);
-    updated_accum = ir_builder_->CreateFAdd(accum, product);
+    llvm::Value* product = b_->CreateFMul(lhs_element, rhs_element);
+    updated_accum = b_->CreateFAdd(accum, product);
   }
-  ir_builder_->CreateStore(updated_accum, accum_address);
+  b_->CreateStore(updated_accum, accum_address);
 
   // Exit basic block of reduction loop.
   // - Load accumulator value (the result).
   // - Store into output array.
-  SetToFirstInsertPoint(reduction_loop->GetExitBasicBlock(), ir_builder_);
+  SetToFirstInsertPoint(reduction_loop->GetExitBasicBlock(), b_);
 
-  llvm::Value* result = ir_builder_->CreateLoad(accum_address);
+  llvm::Value* result = b_->CreateLoad(accum_address);
 
   // Create index into target address. The target index is the concatenation of
   // the rhs and lhs indexes with the reduction dimensions removed. The terms
@@ -1392,11 +1377,11 @@ Status DotOpEmitter::Emit() {
     }
   }
 
-  target_array_.EmitWriteArrayElement(target_index, result, ir_builder_);
+  target_array_.EmitWriteArrayElement(target_index, result, b_);
 
   // Set the IR builder insert point to the exit basic block of the outer most
   // loop.
-  ir_builder_->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
+  b_->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
 
   return Status::OK();
 }
@@ -1405,31 +1390,30 @@ Status DotOpEmitter::EmitScalarDot() {
   // A scalar dot is just a scalar multiply.
   llvm::Value* result;
   // Use the same index_type for all tensor accesses in the same kernel.
-  llvm::Type* index_type = ir_builder_->getInt64Ty();
+  llvm::Type* index_type = b_->getInt64Ty();
   llvm_ir::IrArray::Index element_index(index_type);
   llvm::Value* lhs_value =
-      lhs_array_.EmitReadArrayElement(/*index=*/element_index, ir_builder_);
+      lhs_array_.EmitReadArrayElement(/*index=*/element_index, b_);
   llvm::Value* rhs_value =
-      rhs_array_.EmitReadArrayElement(/*index=*/element_index, ir_builder_);
+      rhs_array_.EmitReadArrayElement(/*index=*/element_index, b_);
   if (ShapeUtil::ElementIsComplex(lhs_array_.GetShape())) {
-#define REAL(x) ir_builder_->CreateExtractValue(x, {0})
-#define IMAG(x) ir_builder_->CreateExtractValue(x, {1})
-    llvm::Value* real = ir_builder_->CreateFSub(
-        ir_builder_->CreateFMul(REAL(lhs_value), REAL(rhs_value)),
-        ir_builder_->CreateFMul(IMAG(lhs_value), IMAG(rhs_value)));
-    llvm::Value* imag = ir_builder_->CreateFAdd(
-        ir_builder_->CreateFMul(REAL(lhs_value), IMAG(rhs_value)),
-        ir_builder_->CreateFMul(IMAG(lhs_value), REAL(rhs_value)));
+#define REAL(x) b_->CreateExtractValue(x, {0})
+#define IMAG(x) b_->CreateExtractValue(x, {1})
+    llvm::Value* real =
+        b_->CreateFSub(b_->CreateFMul(REAL(lhs_value), REAL(rhs_value)),
+                       b_->CreateFMul(IMAG(lhs_value), IMAG(rhs_value)));
+    llvm::Value* imag =
+        b_->CreateFAdd(b_->CreateFMul(REAL(lhs_value), IMAG(rhs_value)),
+                       b_->CreateFMul(IMAG(lhs_value), REAL(rhs_value)));
 #undef IMAG
 #undef REAL
     result = llvm::ConstantAggregateZero::get(lhs_array_.GetElementLlvmType());
-    result = ir_builder_->CreateInsertValue(result, real, {0});
-    result = ir_builder_->CreateInsertValue(result, imag, {1});
+    result = b_->CreateInsertValue(result, real, {0});
+    result = b_->CreateInsertValue(result, imag, {1});
   } else {
-    result = ir_builder_->CreateFMul(lhs_value, rhs_value);
+    result = b_->CreateFMul(lhs_value, rhs_value);
   }
-  target_array_.EmitWriteArrayElement(/*index=*/element_index, result,
-                                      ir_builder_);
+  target_array_.EmitWriteArrayElement(/*index=*/element_index, result, b_);
   return Status::OK();
 }
 
@@ -1452,7 +1436,7 @@ Status DotOpEmitter::EmitCallToRuntime() {
       fn_name = multi_threaded
                     ? runtime::kEigenMatMulF16SymbolName
                     : runtime::kEigenSingleThreadedMatMulF16SymbolName;
-      float_type = ir_builder_->getHalfTy();
+      float_type = b_->getHalfTy();
       break;
     case F32:
       fn_name = multi_threaded
@@ -1461,7 +1445,7 @@ Status DotOpEmitter::EmitCallToRuntime() {
                     : (use_mkl_dnn
                            ? runtime::kMKLSingleThreadedMatMulF32SymbolName
                            : runtime::kEigenSingleThreadedMatMulF32SymbolName);
-      float_type = ir_builder_->getFloatTy();
+      float_type = b_->getFloatTy();
       break;
     case F64:
       fn_name = multi_threaded
@@ -1470,7 +1454,7 @@ Status DotOpEmitter::EmitCallToRuntime() {
                     : (use_mkl_dnn
                            ? runtime::kMKLSingleThreadedMatMulF64SymbolName
                            : runtime::kEigenSingleThreadedMatMulF64SymbolName);
-      float_type = ir_builder_->getDoubleTy();
+      float_type = b_->getDoubleTy();
       break;
     default:
       return Unimplemented("Invalid type %s for dot operation",
@@ -1478,16 +1462,16 @@ Status DotOpEmitter::EmitCallToRuntime() {
   }
 
   llvm::Type* float_ptr_type = float_type->getPointerTo();
-  llvm::Type* int64_type = ir_builder_->getInt64Ty();
-  llvm::Type* int32_type = ir_builder_->getInt32Ty();
-  llvm::Type* int8_ptr_type = ir_builder_->getInt8Ty()->getPointerTo();
+  llvm::Type* int64_type = b_->getInt64Ty();
+  llvm::Type* int32_type = b_->getInt32Ty();
+  llvm::Type* int8_ptr_type = b_->getInt8Ty()->getPointerTo();
   llvm::FunctionType* matmul_type = llvm::FunctionType::get(
-      ir_builder_->getVoidTy(),
+      b_->getVoidTy(),
       {int8_ptr_type, float_ptr_type, float_ptr_type, float_ptr_type,
        int64_type, int64_type, int64_type, int32_type, int32_type},
       /*isVarArg=*/false);
 
-  llvm::Function* function = ir_builder_->GetInsertBlock()->getParent();
+  llvm::Function* function = b_->GetInsertBlock()->getParent();
   llvm::Module* module = function->getParent();
 
   llvm::Function* matmul_func = llvm::cast<llvm::Function>(
@@ -1522,18 +1506,15 @@ Status DotOpEmitter::EmitCallToRuntime() {
     std::swap(transpose_lhs, transpose_rhs);
   }
 
-  ir_builder_->CreateCall(
+  b_->CreateCall(
       matmul_func,
-      {ir_builder_->CreateBitCast(executable_run_options_value_, int8_ptr_type),
-       ir_builder_->CreateBitCast(target_array_.GetBasePointer(),
-                                  float_ptr_type),
-       ir_builder_->CreateBitCast(lhs->GetBasePointer(), float_ptr_type),
-       ir_builder_->CreateBitCast(rhs->GetBasePointer(), float_ptr_type),
-       ir_builder_->getInt64(mat_mult_dims.m),
-       ir_builder_->getInt64(mat_mult_dims.n),
-       ir_builder_->getInt64(mat_mult_dims.k),
-       ir_builder_->getInt32(transpose_lhs),
-       ir_builder_->getInt32(transpose_rhs)});
+      {b_->CreateBitCast(executable_run_options_value_, int8_ptr_type),
+       b_->CreateBitCast(target_array_.GetBasePointer(), float_ptr_type),
+       b_->CreateBitCast(lhs->GetBasePointer(), float_ptr_type),
+       b_->CreateBitCast(rhs->GetBasePointer(), float_ptr_type),
+       b_->getInt64(mat_mult_dims.m), b_->getInt64(mat_mult_dims.n),
+       b_->getInt64(mat_mult_dims.k), b_->getInt32(transpose_lhs),
+       b_->getInt32(transpose_rhs)});
   return Status::OK();
 }
 
@@ -1556,36 +1537,6 @@ DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const {
       LayoutUtil::Minor(target_array_.GetShape().layout(), 0) == 0};
 }
 
-llvm_ir::IrArray::Index DotOpEmitter::EmitOperandArrayLoopNest(
-    llvm_ir::ForLoopNest* loop_nest, const llvm_ir::IrArray& operand_array,
-    int64 reduction_dimension, tensorflow::StringPiece name_suffix) {
-  // Prepares the dimension list we will use to emit the loop nest. Outermost
-  // loops are added first. Add loops in major-to-minor order, and skip the
-  // reduction dimension.
-  std::vector<int64> dimensions;
-  const Shape& shape = operand_array.GetShape();
-  for (int i = LayoutUtil::MinorToMajor(shape).size() - 1; i >= 0; --i) {
-    int64 dimension = LayoutUtil::Minor(shape.layout(), i);
-    if (dimension != reduction_dimension) {
-      dimensions.push_back(dimension);
-    }
-  }
-
-  // Create loop nest with one for-loop for each dimension of the
-  // output.
-  llvm_ir::IrArray::Index index =
-      loop_nest->AddLoopsForShapeOnDimensions(shape, dimensions, name_suffix);
-  // Verify every dimension except the reduction dimension was set in the index.
-  for (int dimension = 0; dimension < index.size(); ++dimension) {
-    if (dimension == reduction_dimension) {
-      DCHECK_EQ(nullptr, index[dimension]);
-    } else {
-      DCHECK_NE(nullptr, index[dimension]);
-    }
-  }
-  return index;
-}
-
 // Return whether the given shape is a matrix with no padding.
 static bool IsRank2WithNoPadding(const Shape& shape) {
   return ShapeUtil::Rank(shape) == 2 && !LayoutUtil::IsPadded(shape);
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
index ed2a18976a0f1a88e7bb4632d3a63167d5c146ad..590032fbe907d7ca90bf69b7ccc3170b8efec72e 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
@@ -61,7 +61,7 @@ class DotOpEmitter {
       const HloInstruction& dot, const llvm_ir::IrArray& target_array,
       const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array,
       const llvm_ir::IrArray* addend_array,
-      llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder,
+      llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
       const HloModuleConfig& hlo_module_config,
       const TargetMachineFeatures& target_machine_features);
 
@@ -70,8 +70,7 @@ class DotOpEmitter {
                const llvm_ir::IrArray& lhs_array,
                const llvm_ir::IrArray& rhs_array,
                const llvm_ir::IrArray* addend_array,
-               llvm::Value* executable_run_options_value,
-               llvm::IRBuilder<>* ir_builder,
+               llvm::Value* executable_run_options_value, llvm::IRBuilder<>* b,
                const HloModuleConfig& hlo_module_config,
                const TargetMachineFeatures& target_machine_features);
 
@@ -89,17 +88,6 @@ class DotOpEmitter {
   // Emits a call to the CPU runtime to perform the matrix multiply.
   Status EmitCallToRuntime();
 
-  // Emits a series of nested loops for iterating over an operand array in the
-  // dot operation. Loops are constructed in major to minor dimension layout
-  // order. No loop is emitted for the given reduction_dimension. The function
-  // returns an IrArray index for the given operand_array containing the indvars
-  // of the loops. All dimensions of the index are filled except for the
-  // reduction dimension. name_suffix is the string to append to the names of
-  // LLVM constructs (eg, basic blocks) constructed by this method.
-  llvm_ir::IrArray::Index EmitOperandArrayLoopNest(
-      llvm_ir::ForLoopNest* loop_nest, const llvm_ir::IrArray& operand_array,
-      int64 reduction_dimension, tensorflow::StringPiece name_suffix);
-
   // Represents the dimensions of a matrix-matrix multiply operation.
   struct MatMultDims {
     // The number of rows in the LHS.
@@ -171,7 +159,7 @@ class DotOpEmitter {
   const llvm_ir::IrArray& rhs_array_;
   const llvm_ir::IrArray* addend_array_;
   llvm::Value* executable_run_options_value_;
-  llvm::IRBuilder<>* ir_builder_;
+  llvm::IRBuilder<>* b_;
   const HloModuleConfig& hlo_module_config_;
   const TargetMachineFeatures& target_machine_features_;
 };
diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
index e97113dfa0f59e791d614c0093d0781e49c48ee4..cf955a8add394c204673be0746a451d4edcadc96 100644
--- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc
@@ -38,8 +38,7 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitFloatUnaryOp(
       switch (element_type) {
         case F16:
           cast_result_to_fp16 = true;
-          operand_value = ir_builder_->CreateFPCast(operand_value,
-                                                    ir_builder_->getFloatTy());
+          operand_value = b_->CreateFPCast(operand_value, b_->getFloatTy());
           TF_FALLTHROUGH_INTENDED;
         case F32:
           function_name = "tanhf";
@@ -59,9 +58,9 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitFloatUnaryOp(
       function->setDoesNotThrow();
       function->setDoesNotAccessMemory();
       // Create an instruction to call the function.
-      llvm::Value* result = ir_builder_->CreateCall(function, operand_value);
+      llvm::Value* result = b_->CreateCall(function, operand_value);
       if (cast_result_to_fp16) {
-        result = ir_builder_->CreateFPCast(result, ir_builder_->getHalfTy());
+        result = b_->CreateFPCast(result, b_->getHalfTy());
       }
       return result;
     }
@@ -77,8 +76,8 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitAtan2(
   switch (prim_type) {
     case F16:
       cast_result_to_fp16 = true;
-      lhs = ir_builder_->CreateFPCast(lhs, ir_builder_->getFloatTy());
-      rhs = ir_builder_->CreateFPCast(rhs, ir_builder_->getFloatTy());
+      lhs = b_->CreateFPCast(lhs, b_->getFloatTy());
+      rhs = b_->CreateFPCast(rhs, b_->getFloatTy());
       TF_FALLTHROUGH_INTENDED;
     case F32:
       function_name = "atan2f";
@@ -98,9 +97,9 @@ StatusOr<llvm::Value*> CpuElementalIrEmitter::EmitAtan2(
   function->setDoesNotThrow();
   function->setDoesNotAccessMemory();
   // Create an instruction to call the function.
-  llvm::Value* result = ir_builder_->CreateCall(function, {lhs, rhs});
+  llvm::Value* result = b_->CreateCall(function, {lhs, rhs});
   if (cast_result_to_fp16) {
-    result = ir_builder_->CreateFPCast(result, ir_builder_->getHalfTy());
+    result = b_->CreateFPCast(result, b_->getHalfTy());
   }
   return result;
 }
diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h
index 4446dfd2821fb4b6e75f33694367392ecbcdd8bf..9598a886ab49fcecf5df7bd65f425fe485de3574 100644
--- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h
@@ -31,7 +31,7 @@ class CpuElementalIrEmitter : public ElementalIrEmitter {
  public:
   CpuElementalIrEmitter(const HloModuleConfig& module_config,
                         IrEmitter* ir_emitter, llvm::Module* module)
-      : ElementalIrEmitter(module_config, module, ir_emitter->ir_builder()),
+      : ElementalIrEmitter(module_config, module, ir_emitter->b()),
         ir_emitter_(ir_emitter) {}
 
   llvm_ir::ElementGenerator MakeElementGenerator(
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 05f431642c0d5ccc44d614eb1bfe6fadaa287601..a6d8551841dcba8b81e257f3deb2aacf9b8aff4a 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -51,6 +51,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
@@ -89,14 +90,14 @@ IrEmitter::IrEmitter(
     : assignment_(assignment),
       module_(llvm_module),
       arch_type_(llvm::Triple(llvm_module->getTargetTriple()).getArch()),
-      ir_builder_(llvm_module->getContext()),
+      b_(llvm_module->getContext()),
       instruction_to_profile_idx_(std::move(instruction_to_profile_idx)),
       computation_to_profile_idx_(std::move(computation_to_profile_idx)),
       alias_analysis_(hlo_module, assignment, &llvm_module->getContext()),
       hlo_module_config_(hlo_module.config()),
       is_top_level_computation_(false),
       target_machine_features_(*target_machine_features) {
-  ir_builder_.setFastMathFlags(llvm_ir::GetFastMathFlags(
+  b_.setFastMathFlags(llvm_ir::GetFastMathFlags(
       /*fast_math_enabled=*/hlo_module_config_.debug_options()
           .xla_enable_fast_math()));
 }
@@ -146,7 +147,7 @@ void IrEmitter::InitializeIrFunction(const string& function_name) {
       new IrFunction(function_name, linkage,
                      options::OptimizeForSizeRequested(hlo_module_config_),
                      hlo_module_config_.debug_options().xla_enable_fast_math(),
-                     module_, &ir_builder_, num_dynamic_loop_bounds_));
+                     module_, &b_, num_dynamic_loop_bounds_));
 }
 
 IrEmitter::~IrEmitter() {}
@@ -154,9 +155,9 @@ IrEmitter::~IrEmitter() {}
 Status IrEmitter::HandleBitcast(HloInstruction* bitcast) {
   VLOG(2) << "HandleBitcast: " << bitcast->ToString();
   emitted_value_[bitcast] =
-      ir_builder_.CreateBitCast(GetEmittedValueFor(bitcast->operand(0)),
-                                IrShapeType(bitcast->shape())->getPointerTo(),
-                                AsStringRef(IrName(bitcast)));
+      b_.CreateBitCast(GetEmittedValueFor(bitcast->operand(0)),
+                       IrShapeType(bitcast->shape())->getPointerTo(),
+                       AsStringRef(IrName(bitcast)));
   return Status::OK();
 }
 
@@ -175,25 +176,36 @@ llvm::Constant* IrEmitter::EmitGlobalForLiteral(const Literal& literal) {
       result_global, IrShapeType(literal.shape())->getPointerTo());
 }
 
-Status IrEmitter::HandleConstant(HloInstruction* constant) {
-  VLOG(2) << "HandleConstant: " << constant->ToString();
-  const Literal& literal = constant->literal();
-  llvm::Constant* global_for_const;
+Status IrEmitter::EmitConstantGlobals() {
+  for (const BufferAllocation& allocation : assignment_.Allocations()) {
+    if (!allocation.is_constant()) {
+      continue;
+    }
 
-  auto it = emitted_literals_.find(&literal);
-  if (it != emitted_literals_.end()) {
-    global_for_const = it->second;
-  } else {
-    global_for_const = EmitGlobalForLiteral(literal);
-    emitted_literals_[&literal] = global_for_const;
+    const Literal& literal = llvm_ir::LiteralForConstantAllocation(allocation);
+    llvm::Constant* global_for_const;
+    auto it = emitted_literals_.find(&literal);
+    if (it != emitted_literals_.end()) {
+      global_for_const = it->second;
+    } else {
+      global_for_const = EmitGlobalForLiteral(literal);
+      InsertOrDie(&emitted_literals_, &literal, global_for_const);
+    }
+
+    InsertOrDie(&constant_buffer_to_global_, allocation.index(),
+                global_for_const);
   }
-  emitted_value_[constant] = global_for_const;
-  VLOG(2) << "  emitted value: " << llvm_ir::DumpToString(*global_for_const);
-  VLOG(2) << "  its type: "
-          << llvm_ir::DumpToString(*global_for_const->getType());
+
   return Status::OK();
 }
 
+Status IrEmitter::HandleConstant(HloInstruction* constant) {
+  VLOG(2) << "HandleConstant: " << constant->ToString();
+  // IrEmitter::EmitConstantGlobals has already taken care of emitting the body
+  // of the constant.
+  return EmitTargetAddressForOp(constant);
+}
+
 Status IrEmitter::HandleCopy(HloInstruction* copy) {
   if (ShapeUtil::IsTuple(copy->shape())) {
     // kCopy shallow copies a tuple so just memcpy the top-level buffer.
@@ -273,7 +285,7 @@ Status IrEmitter::HandleGetTupleElement(HloInstruction* get_tuple_element) {
   const Shape& shape = get_tuple_element->shape();
   emitted_value_[get_tuple_element] = llvm_ir::EmitGetTupleElement(
       shape, get_tuple_element->tuple_index(), MinimumAlignmentForShape(shape),
-      GetEmittedValueFor(operand), &ir_builder_, module_);
+      GetEmittedValueFor(operand), &b_, module_);
   return Status::OK();
 }
 
@@ -293,7 +305,7 @@ Status IrEmitter::HandleTupleSelect(HloInstruction* tuple_select) {
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(tuple_select));
   llvm_ir::EmitTupleSelect(GetIrArrayFor(tuple_select), GetIrArrayFor(pred),
                            GetEmittedValueFor(on_true),
-                           GetEmittedValueFor(on_false), &ir_builder_, module_);
+                           GetEmittedValueFor(on_false), &b_, module_);
   return Status::OK();
 }
 
@@ -316,8 +328,8 @@ Status IrEmitter::HandleInfeed(HloInstruction* instruction) {
                       assignment_.GetUniqueSlice(infeed, {1}));
   llvm::Value* token_address = EmitTempBufferPointer(
       token_slice, ShapeUtil::GetTupleElementShape(infeed->shape(), 1));
-  llvm_ir::EmitTuple(GetIrArrayFor(infeed), {data_address, token_address},
-                     &ir_builder_, module_);
+  llvm_ir::EmitTuple(GetIrArrayFor(infeed), {data_address, token_address}, &b_,
+                     module_);
 
   if (ShapeUtil::IsTuple(data_shape)) {
     TF_RET_CHECK(!ShapeUtil::IsNestedTuple(data_shape));
@@ -348,7 +360,7 @@ Status IrEmitter::HandleInfeed(HloInstruction* instruction) {
     }
 
     llvm_ir::EmitTuple(llvm_ir::IrArray(data_address, data_shape),
-                       tuple_element_addresses, &ir_builder_, module_);
+                       tuple_element_addresses, &b_, module_);
   } else {
     TF_RETURN_IF_ERROR(
         EmitXfeedTransfer(XfeedKind::kInfeed, data_shape, data_address));
@@ -369,14 +381,14 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape,
   int32 length_32 = static_cast<int32>(length);
 
   int32 shape_length;
-  TF_ASSIGN_OR_RETURN(llvm::Value * shape_ptr,
-                      llvm_ir::EncodeSelfDescribingShapeConstant(
-                          shape, &shape_length, &ir_builder_));
+  TF_ASSIGN_OR_RETURN(
+      llvm::Value * shape_ptr,
+      llvm_ir::EncodeSelfDescribingShapeConstant(shape, &shape_length, &b_));
 
   // The signature of the acquire infeed buffer function is:
   //
   //   (void*)(int32 length);
-  llvm::Type* int32_type = ir_builder_.getInt32Ty();
+  llvm::Type* int32_type = b_.getInt32Ty();
   llvm::Type* i8_ptr_type = llvm::Type::getInt8PtrTy(module_->getContext());
   llvm::FunctionType* acquire_type = llvm::FunctionType::get(
       i8_ptr_type, {int32_type, i8_ptr_type, int32_type},
@@ -396,8 +408,7 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape,
   //
   //   (void)(int32 length, void* buffer);
   llvm::FunctionType* release_type = llvm::FunctionType::get(
-      ir_builder_.getVoidTy(),
-      {int32_type, i8_ptr_type, i8_ptr_type, int32_type},
+      b_.getVoidTy(), {int32_type, i8_ptr_type, i8_ptr_type, int32_type},
       /*isVarArg=*/false);
 
   llvm::Function* release_func;
@@ -414,25 +425,22 @@ Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape,
   // of size exactly 'length_32', and the runtime is responsible for
   // check-failing the process if there is a mismatch, versus passing us back a
   // buffer that we might overrun.
-  llvm::Value* acquired_pointer = ir_builder_.CreateCall(
-      acquire_func, {ir_builder_.getInt32(length_32), shape_ptr,
-                     ir_builder_.getInt32(shape_length)});
+  llvm::Value* acquired_pointer = b_.CreateCall(
+      acquire_func,
+      {b_.getInt32(length_32), shape_ptr, b_.getInt32(shape_length)});
 
   if (kind == XfeedKind::kInfeed) {
     // Copy to the program buffer address from the acquired buffer.
-    ir_builder_.CreateMemCpy(program_buffer_address, /*DstAlign=*/1,
-                             acquired_pointer,
-                             /*SrcAlign=*/1, length_32);
+    b_.CreateMemCpy(program_buffer_address, /*DstAlign=*/1, acquired_pointer,
+                    /*SrcAlign=*/1, length_32);
   } else {
     // Outfeed -- copy from the in-program address to the acquired buffer.
-    ir_builder_.CreateMemCpy(acquired_pointer, /*DstAlign=*/1,
-                             program_buffer_address,
-                             /*SrcAlign=*/1, length_32);
+    b_.CreateMemCpy(acquired_pointer, /*DstAlign=*/1, program_buffer_address,
+                    /*SrcAlign=*/1, length_32);
   }
 
-  ir_builder_.CreateCall(release_func,
-                         {ir_builder_.getInt32(length_32), acquired_pointer,
-                          shape_ptr, ir_builder_.getInt32(shape_length)});
+  b_.CreateCall(release_func, {b_.getInt32(length_32), acquired_pointer,
+                               shape_ptr, b_.getInt32(shape_length)});
 
   return Status::OK();
 }
@@ -453,7 +461,7 @@ Status IrEmitter::HandleOutfeed(HloInstruction* outfeed) {
         ShapeUtil::GetTupleElementShape(operand_shape, i);
     llvm::Value* tuple_element = llvm_ir::EmitGetTupleElement(
         tuple_element_shape, i, MinimumAlignmentForShape(tuple_element_shape),
-        value, &ir_builder_, module_);
+        value, &b_, module_);
     TF_RETURN_IF_ERROR(EmitXfeedTransfer(XfeedKind::kOutfeed,
                                          tuple_element_shape, tuple_element));
   }
@@ -472,7 +480,7 @@ Status IrEmitter::HandleTuple(HloInstruction* tuple) {
   for (auto operand : tuple->operands()) {
     base_ptrs.push_back(GetEmittedValueFor(operand));
   }
-  llvm_ir::EmitTuple(GetIrArrayFor(tuple), base_ptrs, &ir_builder_, module_);
+  llvm_ir::EmitTuple(GetIrArrayFor(tuple), base_ptrs, &b_, module_);
   return Status::OK();
 }
 
@@ -483,8 +491,7 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForMap(
   std::vector<llvm::Value*> parameter_addresses;
   for (const HloInstruction* operand : map->operands()) {
     const llvm_ir::IrArray& array = GetIrArrayFor(operand);
-    parameter_addresses.push_back(
-        array.EmitArrayElementAddress(index, &ir_builder_));
+    parameter_addresses.push_back(array.EmitArrayElementAddress(index, &b_));
   }
   return EmitElementFunctionCall(mapped_ir_function, map->shape(),
                                  parameter_addresses, "map_function");
@@ -510,13 +517,12 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduceWindow(
   PrimitiveType operand_element_type = operand->shape().element_type();
   llvm::Value* accumulator_address = llvm_ir::EmitAllocaAtFunctionEntry(
       llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_),
-      "reduce_window_accumulator_address", &ir_builder_,
+      "reduce_window_accumulator_address", &b_,
       MinimumAlignmentForPrimitiveType(operand_element_type));
-  ir_builder_.CreateStore(
-      ir_builder_.CreateLoad(GetEmittedValueFor(reduce_window->operand(1))),
-      accumulator_address);
+  b_.CreateStore(b_.CreateLoad(GetEmittedValueFor(reduce_window->operand(1))),
+                 accumulator_address);
 
-  llvm_ir::ForLoopNest loops(IrName(reduce_window, "inner"), &ir_builder_);
+  llvm_ir::ForLoopNest loops(IrName(reduce_window, "inner"), &b_);
   std::vector<int64> window_size;
   for (const auto& dim : window.dimensions()) {
     window_size.push_back(dim.size());
@@ -525,48 +531,47 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduceWindow(
       ShapeUtil::MakeShape(operand_element_type, window_size), "window");
   CHECK_EQ(window_index.size(), index.size());
 
-  SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_);
+  SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
 
-  llvm_ir::IrArray::Index input_index(ir_builder_.getInt64Ty(), index.size());
+  llvm_ir::IrArray::Index input_index(b_.getInt64Ty(), index.size());
   llvm::Value* in_bounds_condition = nullptr;
   for (size_t i = 0; i < index.size(); ++i) {
-    llvm::Value* strided_index = ir_builder_.CreateNSWMul(
-        index[i], ir_builder_.getInt64(window.dimensions(i).stride()));
-    input_index[i] = ir_builder_.CreateNSWSub(
-        ir_builder_.CreateNSWAdd(strided_index, window_index[i]),
-        ir_builder_.getInt64(window.dimensions(i).padding_low()));
+    llvm::Value* strided_index =
+        b_.CreateNSWMul(index[i], b_.getInt64(window.dimensions(i).stride()));
+    input_index[i] =
+        b_.CreateNSWSub(b_.CreateNSWAdd(strided_index, window_index[i]),
+                        b_.getInt64(window.dimensions(i).padding_low()));
 
     // We need to check if 0 <= input_index[i] < bound, as otherwise we are in
     // the padding so that we can skip the computation. That is equivalent to
     // input_index[i] < bound as an *unsigned* comparison, since a negative
     // value will wrap to a large positive value.
-    llvm::Value* index_condition = ir_builder_.CreateICmpULT(
+    llvm::Value* index_condition = b_.CreateICmpULT(
         input_index[i],
-        ir_builder_.getInt64(ShapeUtil::GetDimension(operand->shape(), i)));
+        b_.getInt64(ShapeUtil::GetDimension(operand->shape(), i)));
     if (in_bounds_condition == nullptr) {
       in_bounds_condition = index_condition;
     } else {
-      in_bounds_condition =
-          ir_builder_.CreateAnd(in_bounds_condition, index_condition);
+      in_bounds_condition = b_.CreateAnd(in_bounds_condition, index_condition);
     }
   }
   CHECK(in_bounds_condition != nullptr);
 
   llvm_ir::LlvmIfData if_data =
-      llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &ir_builder_);
-  SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+      llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_);
+  SetToFirstInsertPoint(if_data.true_block, &b_);
 
   // We are not in the padding, so carry out the computation.
   llvm_ir::IrArray input_array(GetIrArrayFor(operand));
   llvm::Value* input_value_address =
-      input_array.EmitArrayElementAddress(input_index, &ir_builder_);
+      input_array.EmitArrayElementAddress(input_index, &b_);
   llvm::Value* result = EmitElementFunctionCall(
       reducer_function, reduce_window->shape(),
       {accumulator_address, input_value_address}, "reducer_function");
-  ir_builder_.CreateStore(result, accumulator_address);
+  b_.CreateStore(result, accumulator_address);
 
-  SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_);
-  return ir_builder_.CreateLoad(accumulator_address);
+  SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_);
+  return b_.CreateLoad(accumulator_address);
 }
 
 Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) {
@@ -649,141 +654,127 @@ Status IrEmitter::HandleSelectAndScatter(HloInstruction* select_and_scatter) {
       select_and_scatter, /*desc=*/IrName(select_and_scatter, "init"),
       [this, init_value](const llvm_ir::IrArray::Index& target_index) {
         llvm::Value* init_value_addr = GetEmittedValueFor(init_value);
-        return ir_builder_.CreateLoad(init_value_addr);
+        return b_.CreateLoad(init_value_addr);
       }));
 
   // Create a loop to iterate over the source array to scatter to the output.
-  llvm_ir::ForLoopNest source_loops(IrName(select_and_scatter), &ir_builder_);
+  llvm_ir::ForLoopNest source_loops(IrName(select_and_scatter), &b_);
   const llvm_ir::IrArray::Index source_index =
       source_loops.AddLoopsForShape(source->shape(), "source");
-  SetToFirstInsertPoint(source_loops.GetInnerLoopBodyBasicBlock(),
-                        &ir_builder_);
+  SetToFirstInsertPoint(source_loops.GetInnerLoopBodyBasicBlock(), &b_);
 
   // Allocate space to keep the currently selected value, its index, and
   // the boolean initialized_flag, which is initially set to false.
   llvm::Value* selected_value_address = llvm_ir::EmitAllocaAtFunctionEntry(
       llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_),
-      "selected_value_address", &ir_builder_,
+      "selected_value_address", &b_,
       MinimumAlignmentForPrimitiveType(operand_element_type));
   llvm::Value* selected_index_address =
       llvm_ir::EmitAllocaAtFunctionEntryWithCount(
-          ir_builder_.getInt64Ty(), ir_builder_.getInt32(rank),
-          "selected_index_address", &ir_builder_);
+          b_.getInt64Ty(), b_.getInt32(rank), "selected_index_address", &b_);
   llvm::Value* initialized_flag_address = llvm_ir::EmitAllocaAtFunctionEntry(
-      ir_builder_.getInt1Ty(), "initialized_flag_address", &ir_builder_);
-  ir_builder_.CreateStore(ir_builder_.getInt1(false), initialized_flag_address);
+      b_.getInt1Ty(), "initialized_flag_address", &b_);
+  b_.CreateStore(b_.getInt1(false), initialized_flag_address);
 
   // Create the inner loop to iterate over the window.
-  llvm_ir::ForLoopNest window_loops(IrName(select_and_scatter, "window"),
-                                    &ir_builder_);
+  llvm_ir::ForLoopNest window_loops(IrName(select_and_scatter, "window"), &b_);
   std::vector<int64> window_size;
   for (const auto& dim : window.dimensions()) {
     window_size.push_back(dim.size());
   }
   const llvm_ir::IrArray::Index window_index = window_loops.AddLoopsForShape(
       ShapeUtil::MakeShape(operand_element_type, window_size), "window");
-  SetToFirstInsertPoint(window_loops.GetInnerLoopBodyBasicBlock(),
-                        &ir_builder_);
+  SetToFirstInsertPoint(window_loops.GetInnerLoopBodyBasicBlock(), &b_);
 
   // Compute the operand index to visit and evaluate the condition whether the
   // operand index is within the bounds. The unsigned comparison includes
   // checking whether the operand index >= 0.
-  llvm_ir::IrArray::Index operand_index(ir_builder_.getInt64Ty(),
-                                        source_index.size());
-  llvm::Value* in_bounds_condition = ir_builder_.getTrue();
+  llvm_ir::IrArray::Index operand_index(b_.getInt64Ty(), source_index.size());
+  llvm::Value* in_bounds_condition = b_.getTrue();
   for (int64 i = 0; i < rank; ++i) {
-    llvm::Value* strided_index = ir_builder_.CreateNSWMul(
-        source_index[i], ir_builder_.getInt64(window.dimensions(i).stride()));
-    operand_index[i] = ir_builder_.CreateNSWSub(
-        ir_builder_.CreateNSWAdd(strided_index, window_index[i]),
-        ir_builder_.getInt64(window.dimensions(i).padding_low()));
-    llvm::Value* index_condition = ir_builder_.CreateICmpULT(
+    llvm::Value* strided_index = b_.CreateNSWMul(
+        source_index[i], b_.getInt64(window.dimensions(i).stride()));
+    operand_index[i] =
+        b_.CreateNSWSub(b_.CreateNSWAdd(strided_index, window_index[i]),
+                        b_.getInt64(window.dimensions(i).padding_low()));
+    llvm::Value* index_condition = b_.CreateICmpULT(
         operand_index[i],
-        ir_builder_.getInt64(ShapeUtil::GetDimension(operand->shape(), i)));
-    in_bounds_condition =
-        ir_builder_.CreateAnd(in_bounds_condition, index_condition);
+        b_.getInt64(ShapeUtil::GetDimension(operand->shape(), i)));
+    in_bounds_condition = b_.CreateAnd(in_bounds_condition, index_condition);
   }
   CHECK(in_bounds_condition != nullptr);
 
   // Only need to do something if the operand index is within the bounds. First
   // check if the initialized_flag is set.
   llvm_ir::LlvmIfData if_in_bounds =
-      llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &ir_builder_);
-  SetToFirstInsertPoint(if_in_bounds.true_block, &ir_builder_);
-  llvm_ir::LlvmIfData if_initialized =
-      llvm_ir::EmitIfThenElse(ir_builder_.CreateLoad(initialized_flag_address),
-                              "initialized", &ir_builder_);
+      llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_);
+  SetToFirstInsertPoint(if_in_bounds.true_block, &b_);
+  llvm_ir::LlvmIfData if_initialized = llvm_ir::EmitIfThenElse(
+      b_.CreateLoad(initialized_flag_address), "initialized", &b_);
 
   // If the initialized_flag is false, initialize the selected value and index
   // with the currently visiting operand.
-  SetToFirstInsertPoint(if_initialized.false_block, &ir_builder_);
+  SetToFirstInsertPoint(if_initialized.false_block, &b_);
   const auto save_operand_index =
       [&](const llvm_ir::IrArray::Index& operand_index) {
         for (int64 i = 0; i < rank; ++i) {
           llvm::Value* selected_index_address_slot =
-              ir_builder_.CreateInBoundsGEP(selected_index_address,
-                                            {ir_builder_.getInt32(i)});
-          ir_builder_.CreateStore(operand_index[i],
-                                  selected_index_address_slot);
+              b_.CreateInBoundsGEP(selected_index_address, {b_.getInt32(i)});
+          b_.CreateStore(operand_index[i], selected_index_address_slot);
         }
       };
   llvm_ir::IrArray operand_array(GetIrArrayFor(operand));
   llvm::Value* operand_data =
-      operand_array.EmitReadArrayElement(operand_index, &ir_builder_);
-  ir_builder_.CreateStore(operand_data, selected_value_address);
+      operand_array.EmitReadArrayElement(operand_index, &b_);
+  b_.CreateStore(operand_data, selected_value_address);
   save_operand_index(operand_index);
-  ir_builder_.CreateStore(ir_builder_.getInt1(true), initialized_flag_address);
+  b_.CreateStore(b_.getInt1(true), initialized_flag_address);
 
   // If the initialized_flag is true, call the `select` function to potentially
   // update the selected value and index with the currently visiting operand.
-  SetToFirstInsertPoint(if_initialized.true_block, &ir_builder_);
+  SetToFirstInsertPoint(if_initialized.true_block, &b_);
   const Shape output_shape = ShapeUtil::MakeShape(PRED, {});
   llvm::Value* operand_address =
-      operand_array.EmitArrayElementAddress(operand_index, &ir_builder_);
+      operand_array.EmitArrayElementAddress(operand_index, &b_);
   llvm::Value* result = EmitElementFunctionCall(
       select_function, output_shape, {selected_value_address, operand_address},
       "select_function");
 
   // If the 'select' function returns false, update the selected value and the
   // index to the currently visiting operand.
-  llvm::Value* cond = ir_builder_.CreateICmpNE(
+  llvm::Value* cond = b_.CreateICmpNE(
       result,
       llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0),
       "boolean_predicate");
   llvm_ir::LlvmIfData if_select_lhs =
-      llvm_ir::EmitIfThenElse(cond, "if-select-lhs", &ir_builder_);
-  SetToFirstInsertPoint(if_select_lhs.false_block, &ir_builder_);
-  ir_builder_.CreateStore(ir_builder_.CreateLoad(operand_address),
-                          selected_value_address);
+      llvm_ir::EmitIfThenElse(cond, "if-select-lhs", &b_);
+  SetToFirstInsertPoint(if_select_lhs.false_block, &b_);
+  b_.CreateStore(b_.CreateLoad(operand_address), selected_value_address);
   save_operand_index(operand_index);
 
   // After iterating over the window elements, scatter the source element to
   // the selected index of the output. The value we store at the output
   // location is computed by calling the `scatter` function with the source
   // value and the current output value.
-  SetToFirstInsertPoint(window_loops.GetOuterLoopExitBasicBlock(),
-                        &ir_builder_);
+  SetToFirstInsertPoint(window_loops.GetOuterLoopExitBasicBlock(), &b_);
   llvm_ir::IrArray::Index selected_index(source_index.GetType());
   for (int64 i = 0; i < rank; ++i) {
-    llvm::Value* selected_index_address_slot = ir_builder_.CreateInBoundsGEP(
-        selected_index_address, {ir_builder_.getInt32(i)});
-    selected_index.push_back(
-        ir_builder_.CreateLoad(selected_index_address_slot));
+    llvm::Value* selected_index_address_slot =
+        b_.CreateInBoundsGEP(selected_index_address, {b_.getInt32(i)});
+    selected_index.push_back(b_.CreateLoad(selected_index_address_slot));
   }
   llvm_ir::IrArray source_array(GetIrArrayFor(source));
   llvm::Value* source_value_address =
-      source_array.EmitArrayElementAddress(source_index, &ir_builder_);
+      source_array.EmitArrayElementAddress(source_index, &b_);
   llvm_ir::IrArray output_array(GetIrArrayFor(select_and_scatter));
   llvm::Value* output_value_address =
-      output_array.EmitArrayElementAddress(selected_index, &ir_builder_);
+      output_array.EmitArrayElementAddress(selected_index, &b_);
   llvm::Value* scatter_value = EmitElementFunctionCall(
       scatter_function, source->shape(),
       {output_value_address, source_value_address}, "scatter_function");
-  output_array.EmitWriteArrayElement(selected_index, scatter_value,
-                                     &ir_builder_);
+  output_array.EmitWriteArrayElement(selected_index, scatter_value, &b_);
 
-  SetToFirstInsertPoint(source_loops.GetOuterLoopExitBasicBlock(),
-                        &ir_builder_);
+  SetToFirstInsertPoint(source_loops.GetOuterLoopExitBasicBlock(), &b_);
   return Status::OK();
 }
 
@@ -822,7 +813,7 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   // Dot operation is complicated so we delegate to a helper class.
   return DotOpEmitter::EmitDotOperation(
       *dot, target_array, lhs_array, rhs_array, /*addend_array=*/nullptr,
-      GetExecutableRunOptionsArgument(), &ir_builder_, hlo_module_config_,
+      GetExecutableRunOptionsArgument(), &b_, hlo_module_config_,
       target_machine_features_);
 }
 
@@ -849,12 +840,12 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
   llvm::Type* lhs_llvm_type =
       llvm_ir::PrimitiveTypeToIrType(lhs_element_type, module_);
   llvm::Value* sum_address = llvm_ir::EmitAllocaAtFunctionEntry(
-      lhs_llvm_type, "convolution_sum_address", &ir_builder_,
+      lhs_llvm_type, "convolution_sum_address", &b_,
       MinimumAlignmentForPrimitiveType(lhs_element_type));
   llvm::Value* constant_zero = llvm::Constant::getNullValue(lhs_llvm_type);
-  ir_builder_.CreateStore(constant_zero, sum_address);
+  b_.CreateStore(constant_zero, sum_address);
 
-  llvm_ir::ForLoopNest loops(IrName(convolution, "inner"), &ir_builder_);
+  llvm_ir::ForLoopNest loops(IrName(convolution, "inner"), &b_);
   std::vector<llvm::Value*> kernel_spatial(num_spatial_dims);
   for (int i = 0; i < num_spatial_dims; ++i) {
     kernel_spatial[i] =
@@ -870,7 +861,7 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
                    "iz")
           ->GetIndVarValue();
 
-  SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_);
+  SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
 
   // Calculate the spatial index in the input array, taking striding, dilation
   // and padding into account. An index in the padding will be out of the bounds
@@ -878,13 +869,12 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
   const auto calculate_input_index = [this](llvm::Value* output_index,
                                             llvm::Value* kernel_index,
                                             const WindowDimension& window_dim) {
-    llvm::Value* strided_index = ir_builder_.CreateNSWMul(
-        output_index, ir_builder_.getInt64(window_dim.stride()));
-    llvm::Value* dilated_kernel_index = ir_builder_.CreateNSWMul(
-        kernel_index, ir_builder_.getInt64(window_dim.window_dilation()));
-    return ir_builder_.CreateNSWSub(
-        ir_builder_.CreateNSWAdd(strided_index, dilated_kernel_index),
-        ir_builder_.getInt64(window_dim.padding_low()));
+    llvm::Value* strided_index =
+        b_.CreateNSWMul(output_index, b_.getInt64(window_dim.stride()));
+    llvm::Value* dilated_kernel_index = b_.CreateNSWMul(
+        kernel_index, b_.getInt64(window_dim.window_dilation()));
+    return b_.CreateNSWSub(b_.CreateNSWAdd(strided_index, dilated_kernel_index),
+                           b_.getInt64(window_dim.padding_low()));
   };
   std::vector<llvm::Value*> input_spatial(num_spatial_dims);
   for (int i = 0; i < num_spatial_dims; ++i) {
@@ -901,30 +891,27 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
   // Also need to check that the input coordinates are not in one of the
   // holes created by base dilation.
   const auto not_in_hole = [&](llvm::Value* input_index, int64 base_dilation) {
-    llvm::Value* remainder = ir_builder_.CreateSRem(
-        input_index, ir_builder_.getInt64(base_dilation));
-    return ir_builder_.CreateICmpEQ(remainder, ir_builder_.getInt64(0));
+    llvm::Value* remainder =
+        b_.CreateSRem(input_index, b_.getInt64(base_dilation));
+    return b_.CreateICmpEQ(remainder, b_.getInt64(0));
   };
 
-  llvm::Value* in_bounds_condition = ir_builder_.getInt1(true);
+  llvm::Value* in_bounds_condition = b_.getInt1(true);
   for (int i = 0; i < num_spatial_dims; ++i) {
-    llvm::ConstantInt* input_bound =
-        ir_builder_.getInt64(window_util::DilatedBound(
-            lhs->shape().dimensions(dnums.input_spatial_dimensions(i)),
-            window.dimensions(i).base_dilation()));
-    llvm::Value* dim_in_bound =
-        ir_builder_.CreateICmpULT(input_spatial[i], input_bound);
+    llvm::ConstantInt* input_bound = b_.getInt64(window_util::DilatedBound(
+        lhs->shape().dimensions(dnums.input_spatial_dimensions(i)),
+        window.dimensions(i).base_dilation()));
+    llvm::Value* dim_in_bound = b_.CreateICmpULT(input_spatial[i], input_bound);
     llvm::Value* dim_not_in_hole =
         not_in_hole(input_spatial[i], window.dimensions(i).base_dilation());
-    llvm::Value* dim_ok = ir_builder_.CreateAnd(dim_in_bound, dim_not_in_hole);
-    in_bounds_condition = ir_builder_.CreateAnd(in_bounds_condition, dim_ok);
+    llvm::Value* dim_ok = b_.CreateAnd(dim_in_bound, dim_not_in_hole);
+    in_bounds_condition = b_.CreateAnd(in_bounds_condition, dim_ok);
   }
 
   // Now we need to map the dilated base coordinates back to the actual
   // data indices on the lhs.
   const auto undilate = [&](llvm::Value* input_index, int64 base_dilation) {
-    return ir_builder_.CreateSDiv(input_index,
-                                  ir_builder_.getInt64(base_dilation));
+    return b_.CreateSDiv(input_index, b_.getInt64(base_dilation));
   };
   for (int i = 0; i < num_spatial_dims; ++i) {
     input_spatial[i] =
@@ -932,12 +919,12 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
   }
 
   llvm_ir::LlvmIfData if_data =
-      llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &ir_builder_);
-  SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+      llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_);
+  SetToFirstInsertPoint(if_data.true_block, &b_);
 
   // We are not in the padding, so carry out the computation.
   int num_dims = num_spatial_dims + 2;
-  llvm_ir::IrArray::Index input_index(ir_builder_.getInt64Ty(), num_dims);
+  llvm_ir::IrArray::Index input_index(b_.getInt64Ty(), num_dims);
   for (int i = 0; i < num_spatial_dims; ++i) {
     input_index[dnums.input_spatial_dimensions(i)] = input_spatial[i];
   }
@@ -945,13 +932,12 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
   input_index[dnums.input_batch_dimension()] = batch;
 
   llvm_ir::IrArray kernel_array(GetIrArrayFor(rhs));
-  llvm_ir::IrArray::Index kernel_index(ir_builder_.getInt64Ty(), num_dims);
+  llvm_ir::IrArray::Index kernel_index(b_.getInt64Ty(), num_dims);
   for (int i = 0; i < num_spatial_dims; ++i) {
     kernel_index[dnums.kernel_spatial_dimensions(i)] =
         window.dimensions(i).window_reversal()
-            ? ir_builder_.CreateNSWSub(
-                  ir_builder_.getInt64(window.dimensions(i).size() - 1),
-                  kernel_spatial[i])
+            ? b_.CreateNSWSub(b_.getInt64(window.dimensions(i).size() - 1),
+                              kernel_spatial[i])
             : kernel_spatial[i];
   }
 
@@ -959,15 +945,14 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForConvolution(
   kernel_index[dnums.kernel_output_feature_dimension()] = output_feature;
 
   llvm_ir::IrArray input_array(GetIrArrayFor(lhs));
-  llvm::Value* product = ir_builder_.CreateFMul(
-      input_array.EmitReadArrayElement(input_index, &ir_builder_),
-      kernel_array.EmitReadArrayElement(kernel_index, &ir_builder_));
-  llvm::Value* sum =
-      ir_builder_.CreateFAdd(ir_builder_.CreateLoad(sum_address), product);
-  ir_builder_.CreateStore(sum, sum_address);
+  llvm::Value* product =
+      b_.CreateFMul(input_array.EmitReadArrayElement(input_index, &b_),
+                    kernel_array.EmitReadArrayElement(kernel_index, &b_));
+  llvm::Value* sum = b_.CreateFAdd(b_.CreateLoad(sum_address), product);
+  b_.CreateStore(sum, sum_address);
 
-  SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_);
-  return ir_builder_.CreateLoad(sum_address);
+  SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_);
+  return b_.CreateLoad(sum_address);
 }
 
 Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
@@ -1056,12 +1041,12 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
 
       PrimitiveType primitive_type = lhs->shape().element_type();
       llvm::Type* ir_ptr_type = primitive_type == F16
-                                    ? ir_builder_.getHalfTy()->getPointerTo()
-                                    : ir_builder_.getFloatTy()->getPointerTo();
-      llvm::Type* int64_type = ir_builder_.getInt64Ty();
-      llvm::Type* int8_ptr_type = ir_builder_.getInt8Ty()->getPointerTo();
+                                    ? b_.getHalfTy()->getPointerTo()
+                                    : b_.getFloatTy()->getPointerTo();
+      llvm::Type* int64_type = b_.getInt64Ty();
+      llvm::Type* int8_ptr_type = b_.getInt8Ty()->getPointerTo();
       llvm::FunctionType* conv_type = llvm::FunctionType::get(
-          ir_builder_.getVoidTy(),
+          b_.getVoidTy(),
           {int8_ptr_type, ir_ptr_type, ir_ptr_type, ir_ptr_type, int64_type,
            int64_type,    int64_type,  int64_type,  int64_type,  int64_type,
            int64_type,    int64_type,  int64_type,  int64_type,  int64_type,
@@ -1093,34 +1078,34 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
       conv_func->setCallingConv(llvm::CallingConv::C);
       conv_func->setDoesNotThrow();
       conv_func->setOnlyAccessesArgMemory();
-      ir_builder_.CreateCall(
-          conv_func, {
-                         GetExecutableRunOptionsArgument(),
-                         ir_builder_.CreateBitCast(
-                             GetEmittedValueFor(convolution), ir_ptr_type),
-                         ir_builder_.CreateBitCast(lhs_address, ir_ptr_type),
-                         ir_builder_.CreateBitCast(rhs_address, ir_ptr_type),
-                         ir_builder_.getInt64(input_batch),
-                         ir_builder_.getInt64(input_rows),
-                         ir_builder_.getInt64(input_cols),
-                         ir_builder_.getInt64(input_channels),
-                         ir_builder_.getInt64(kernel_rows),
-                         ir_builder_.getInt64(kernel_cols),
-                         ir_builder_.getInt64(kernel_channels),
-                         ir_builder_.getInt64(kernel_filters),
-                         ir_builder_.getInt64(output_rows),
-                         ir_builder_.getInt64(output_cols),
-                         ir_builder_.getInt64(row_stride),
-                         ir_builder_.getInt64(col_stride),
-                         ir_builder_.getInt64(padding_top),
-                         ir_builder_.getInt64(padding_bottom),
-                         ir_builder_.getInt64(padding_left),
-                         ir_builder_.getInt64(padding_right),
-                         ir_builder_.getInt64(lhs_row_dilation),
-                         ir_builder_.getInt64(lhs_col_dilation),
-                         ir_builder_.getInt64(rhs_row_dilation),
-                         ir_builder_.getInt64(rhs_col_dilation),
-                     });
+      b_.CreateCall(
+          conv_func,
+          {
+              GetExecutableRunOptionsArgument(),
+              b_.CreateBitCast(GetEmittedValueFor(convolution), ir_ptr_type),
+              b_.CreateBitCast(lhs_address, ir_ptr_type),
+              b_.CreateBitCast(rhs_address, ir_ptr_type),
+              b_.getInt64(input_batch),
+              b_.getInt64(input_rows),
+              b_.getInt64(input_cols),
+              b_.getInt64(input_channels),
+              b_.getInt64(kernel_rows),
+              b_.getInt64(kernel_cols),
+              b_.getInt64(kernel_channels),
+              b_.getInt64(kernel_filters),
+              b_.getInt64(output_rows),
+              b_.getInt64(output_cols),
+              b_.getInt64(row_stride),
+              b_.getInt64(col_stride),
+              b_.getInt64(padding_top),
+              b_.getInt64(padding_bottom),
+              b_.getInt64(padding_left),
+              b_.getInt64(padding_right),
+              b_.getInt64(lhs_row_dilation),
+              b_.getInt64(lhs_col_dilation),
+              b_.getInt64(rhs_row_dilation),
+              b_.getInt64(rhs_col_dilation),
+          });
 
       return Status::OK();
     }
@@ -1159,11 +1144,11 @@ Status IrEmitter::HandleFft(HloInstruction* fft) {
   }
 
   // Args have been computed, make the call.
-  llvm::Type* int8_ptr_type = ir_builder_.getInt8Ty()->getPointerTo();
-  llvm::Type* int32_type = ir_builder_.getInt32Ty();
-  llvm::Type* int64_type = ir_builder_.getInt64Ty();
+  llvm::Type* int8_ptr_type = b_.getInt8Ty()->getPointerTo();
+  llvm::Type* int32_type = b_.getInt32Ty();
+  llvm::Type* int64_type = b_.getInt64Ty();
   llvm::FunctionType* fft_type = llvm::FunctionType::get(
-      ir_builder_.getVoidTy(),
+      b_.getVoidTy(),
       {int8_ptr_type, int8_ptr_type, int8_ptr_type, int32_type, int32_type,
        int64_type, int64_type, int64_type, int64_type},
       /*isVarArg=*/false);
@@ -1180,16 +1165,15 @@ Status IrEmitter::HandleFft(HloInstruction* fft) {
   fft_func->setDoesNotThrow();
   fft_func->setOnlyAccessesInaccessibleMemOrArgMem();
   const int fft_rank = fft_length.size();
-  ir_builder_.CreateCall(
+  b_.CreateCall(
       fft_func,
       {GetExecutableRunOptionsArgument(),
-       ir_builder_.CreateBitCast(GetEmittedValueFor(fft), int8_ptr_type),
-       ir_builder_.CreateBitCast(operand_address, int8_ptr_type),
-       ir_builder_.getInt32(fft->fft_type()), ir_builder_.getInt32(fft_rank),
-       ir_builder_.getInt64(input_batch),
-       ir_builder_.getInt64(fft_rank > 0 ? fft_length[0] : 0),
-       ir_builder_.getInt64(fft_rank > 1 ? fft_length[1] : 0),
-       ir_builder_.getInt64(fft_rank > 2 ? fft_length[2] : 0)});
+       b_.CreateBitCast(GetEmittedValueFor(fft), int8_ptr_type),
+       b_.CreateBitCast(operand_address, int8_ptr_type),
+       b_.getInt32(fft->fft_type()), b_.getInt32(fft_rank),
+       b_.getInt64(input_batch), b_.getInt64(fft_rank > 0 ? fft_length[0] : 0),
+       b_.getInt64(fft_rank > 1 ? fft_length[1] : 0),
+       b_.getInt64(fft_rank > 2 ? fft_length[2] : 0)});
 
   return Status::OK();
 }
@@ -1228,11 +1212,10 @@ Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) {
     operand_ptrs.push_back(EmitTempBufferPointer(out_slice, operand_shape));
 
     // TODO(b/63762267): Be more aggressive about specifying alignment.
-    ir_builder_.CreateMemCpy(operand_ptrs.back(), /*DstAlign=*/1, in_ptr,
-                             /*SrcAlign=*/1,
-                             ShapeUtil::ByteSizeOf(operand_shape));
+    b_.CreateMemCpy(operand_ptrs.back(), /*DstAlign=*/1, in_ptr,
+                    /*SrcAlign=*/1, ShapeUtil::ByteSizeOf(operand_shape));
   }
-  llvm_ir::EmitTuple(GetIrArrayFor(crs), operand_ptrs, &ir_builder_, module_);
+  llvm_ir::EmitTuple(GetIrArrayFor(crs), operand_ptrs, &b_, module_);
   return Status::OK();
 }
 
@@ -1278,9 +1261,8 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) {
   // example, float for an XLA F32 element type).
   llvm::Value* params = compute_function_->parameters_arg();
   llvm::Value* param_address_offset =
-      llvm_ir::EmitBufferIndexingGEP(params, param_number, &ir_builder_);
-  llvm::LoadInst* param_address_untyped =
-      ir_builder_.CreateLoad(param_address_offset);
+      llvm_ir::EmitBufferIndexingGEP(params, param_number, &b_);
+  llvm::LoadInst* param_address_untyped = b_.CreateLoad(param_address_offset);
   param_address_untyped->setName(AsStringRef(IrName(parameter, "untyped")));
   if (is_top_level_computation_ &&
       hlo_module_config_.debug_options()
@@ -1295,7 +1277,7 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) {
         llvm::MDNode::get(param_address_untyped->getContext(), /*MDs=*/{}));
   }
 
-  llvm::Value* param_address_typed = ir_builder_.CreateBitCast(
+  llvm::Value* param_address_typed = b_.CreateBitCast(
       param_address_untyped, IrShapeType(param_shape)->getPointerTo());
   emitted_value_[parameter] = param_address_typed;
 
@@ -1403,62 +1385,61 @@ IrEmitter::ReductionGenerator IrEmitter::MatchReductionGenerator(
       return nullptr;
 
     case HloOpcode::kAdd:
-      return [root_is_integral](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
+      return [root_is_integral](llvm::IRBuilder<>* b, llvm::Value* lhs,
                                 llvm::Value* rhs) {
-        return root_is_integral ? ir_builder->CreateAdd(lhs, rhs)
-                                : ir_builder->CreateFAdd(lhs, rhs);
+        return root_is_integral ? b->CreateAdd(lhs, rhs)
+                                : b->CreateFAdd(lhs, rhs);
       };
 
     case HloOpcode::kMultiply:
-      return [root_is_integral](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
+      return [root_is_integral](llvm::IRBuilder<>* b, llvm::Value* lhs,
                                 llvm::Value* rhs) {
-        return root_is_integral ? ir_builder->CreateMul(lhs, rhs)
-                                : ir_builder->CreateFMul(lhs, rhs);
+        return root_is_integral ? b->CreateMul(lhs, rhs)
+                                : b->CreateFMul(lhs, rhs);
       };
 
     case HloOpcode::kAnd:
-      return [](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
-                llvm::Value* rhs) { return ir_builder->CreateAnd(lhs, rhs); };
+      return [](llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) {
+        return b->CreateAnd(lhs, rhs);
+      };
 
     case HloOpcode::kOr:
-      return [](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
-                llvm::Value* rhs) { return ir_builder->CreateOr(lhs, rhs); };
+      return [](llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) {
+        return b->CreateOr(lhs, rhs);
+      };
 
     case HloOpcode::kXor:
-      return [](llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
-                llvm::Value* rhs) { return ir_builder->CreateXor(lhs, rhs); };
+      return [](llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) {
+        return b->CreateXor(lhs, rhs);
+      };
 
     case HloOpcode::kMaximum:
       return [root_is_floating_point, root_is_signed](
-                 llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
-                 llvm::Value* rhs) {
+                 llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) {
         if (root_is_floating_point) {
           return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::maxnum,
-                                              {lhs, rhs}, {lhs->getType()},
-                                              ir_builder);
+                                              {lhs, rhs}, {lhs->getType()}, b);
         }
 
-        return ir_builder->CreateSelect(
-            ir_builder->CreateICmp(root_is_signed ? llvm::ICmpInst::ICMP_SGE
-                                                  : llvm::ICmpInst::ICMP_UGE,
-                                   lhs, rhs),
+        return b->CreateSelect(
+            b->CreateICmp(root_is_signed ? llvm::ICmpInst::ICMP_SGE
+                                         : llvm::ICmpInst::ICMP_UGE,
+                          lhs, rhs),
             lhs, rhs);
       };
 
     case HloOpcode::kMinimum:
       return [root_is_floating_point, root_is_signed](
-                 llvm::IRBuilder<>* ir_builder, llvm::Value* lhs,
-                 llvm::Value* rhs) {
+                 llvm::IRBuilder<>* b, llvm::Value* lhs, llvm::Value* rhs) {
         if (root_is_floating_point) {
           return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::minnum,
-                                              {lhs, rhs}, {lhs->getType()},
-                                              ir_builder);
+                                              {lhs, rhs}, {lhs->getType()}, b);
         }
 
-        return ir_builder->CreateSelect(
-            ir_builder->CreateICmp(root_is_signed ? llvm::ICmpInst::ICMP_SLE
-                                                  : llvm::ICmpInst::ICMP_ULE,
-                                   lhs, rhs),
+        return b->CreateSelect(
+            b->CreateICmp(root_is_signed ? llvm::ICmpInst::ICMP_SLE
+                                         : llvm::ICmpInst::ICMP_ULE,
+                          lhs, rhs),
             lhs, rhs);
       };
   }
@@ -1527,34 +1508,31 @@ IrEmitter::EmitInnerLoopForVectorizedReduction(
   accumulator.reserve(accumulator_type.size());
   for (auto accumulator_shard_type : accumulator_type) {
     accumulator.push_back(llvm_ir::EmitAllocaAtFunctionEntry(
-        accumulator_shard_type, "accumulator", &ir_builder_, 0));
+        accumulator_shard_type, "accumulator", &b_, 0));
   }
 
-  llvm::Value* init_value_ssa =
-      ir_builder_.CreateLoad(GetEmittedValueFor(init_value));
+  llvm::Value* init_value_ssa = b_.CreateLoad(GetEmittedValueFor(init_value));
 
   for (llvm::Value* accumulator_shard : accumulator) {
     llvm::Value* initial_value;
     auto shard_type = accumulator_shard->getType()->getPointerElementType();
     if (auto vector_type = llvm::dyn_cast<llvm::VectorType>(shard_type)) {
-      initial_value = ir_builder_.CreateVectorSplat(
-          vector_type->getNumElements(), init_value_ssa);
+      initial_value =
+          b_.CreateVectorSplat(vector_type->getNumElements(), init_value_ssa);
     } else {
       initial_value = init_value_ssa;
     }
 
-    ir_builder_.CreateAlignedStore(initial_value, accumulator_shard,
-                                   element_alignment);
+    b_.CreateAlignedStore(initial_value, accumulator_shard, element_alignment);
   }
 
   llvm_ir::ForLoopNest reduction_loop_nest(IrName(arg, "vectorized_inner"),
-                                           &ir_builder_);
+                                           &b_);
   llvm_ir::IrArray::Index reduced_dims_index =
       reduction_loop_nest.AddLoopsForShapeOnDimensions(arg->shape(), dimensions,
                                                        "reduction_dim");
 
-  SetToFirstInsertPoint(reduction_loop_nest.GetInnerLoopBodyBasicBlock(),
-                        &ir_builder_);
+  SetToFirstInsertPoint(reduction_loop_nest.GetInnerLoopBodyBasicBlock(), &b_);
 
   llvm_ir::IrArray arg_array(GetIrArrayFor(arg));
   llvm_ir::IrArray::Index input_index = reduced_dims_index;
@@ -1567,38 +1545,34 @@ IrEmitter::EmitInnerLoopForVectorizedReduction(
   }
   CHECK(output_index.end() == it);
 
-  llvm::Value* input_address = ir_builder_.CreateBitCast(
-      arg_array.EmitArrayElementAddress(input_index, &ir_builder_),
-      ir_builder_.getInt8PtrTy());
+  llvm::Value* input_address = b_.CreateBitCast(
+      arg_array.EmitArrayElementAddress(input_index, &b_), b_.getInt8PtrTy());
 
   for (int i = 0; i < accumulator.size(); i++) {
     auto input_address_typed =
-        ir_builder_.CreateBitCast(input_address, accumulator[i]->getType());
+        b_.CreateBitCast(input_address, accumulator[i]->getType());
     auto current_accumulator_value =
-        ir_builder_.CreateAlignedLoad(accumulator[i], element_alignment);
-    auto addend =
-        ir_builder_.CreateAlignedLoad(input_address_typed, element_alignment);
+        b_.CreateAlignedLoad(accumulator[i], element_alignment);
+    auto addend = b_.CreateAlignedLoad(input_address_typed, element_alignment);
     arg_array.AnnotateLoadStoreInstructionWithMetadata(addend);
 
     auto reduced_result =
-        reduction_generator(&ir_builder_, current_accumulator_value, addend);
-    ir_builder_.CreateAlignedStore(reduced_result, accumulator[i],
-                                   element_alignment);
+        reduction_generator(&b_, current_accumulator_value, addend);
+    b_.CreateAlignedStore(reduced_result, accumulator[i], element_alignment);
 
     if (i != (accumulator.size() - 1)) {
-      input_address = ir_builder_.CreateConstInBoundsGEP1_32(
-          reduced_result->getType(), input_address_typed, 1);
+      input_address = b_.CreateConstInBoundsGEP1_32(reduced_result->getType(),
+                                                    input_address_typed, 1);
     }
   }
 
-  SetToFirstInsertPoint(reduction_loop_nest.GetOuterLoopExitBasicBlock(),
-                        &ir_builder_);
+  SetToFirstInsertPoint(reduction_loop_nest.GetOuterLoopExitBasicBlock(), &b_);
 
   ShardedVector result_ssa;
   result_ssa.reserve(accumulator.size());
   for (auto accumulator_shard : accumulator) {
     result_ssa.push_back(
-        ir_builder_.CreateAlignedLoad(accumulator_shard, element_alignment));
+        b_.CreateAlignedLoad(accumulator_shard, element_alignment));
   }
   return result_ssa;
 }
@@ -1607,17 +1581,17 @@ void IrEmitter::EmitShardedVectorStore(
     llvm::Value* store_address, const std::vector<llvm::Value*>& value_to_store,
     const int alignment, const llvm_ir::IrArray& containing_array) {
   for (int i = 0; i < value_to_store.size(); i++) {
-    auto store_address_typed = ir_builder_.CreateBitCast(
+    auto store_address_typed = b_.CreateBitCast(
         store_address,
         llvm::PointerType::getUnqual(value_to_store[i]->getType()));
 
-    auto store_instruction = ir_builder_.CreateAlignedStore(
+    auto store_instruction = b_.CreateAlignedStore(
         value_to_store[i], store_address_typed, alignment);
     containing_array.AnnotateLoadStoreInstructionWithMetadata(
         store_instruction);
 
     if (i != (value_to_store.size() - 1)) {
-      store_address = ir_builder_.CreateConstInBoundsGEP1_32(
+      store_address = b_.CreateConstInBoundsGEP1_32(
           value_to_store[i]->getType(), store_address_typed, 1);
     }
   }
@@ -1683,8 +1657,8 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
   //    }
   //  }
 
-  llvm_ir::ForLoopNest loop_nest(IrName(reduce), &ir_builder_);
-  llvm_ir::IrArray::Index array_index(ir_builder_.getInt64Ty(),
+  llvm_ir::ForLoopNest loop_nest(IrName(reduce), &b_);
+  llvm_ir::IrArray::Index array_index(b_.getInt64Ty(),
                                       reduce->shape().dimensions_size());
   for (int i = LayoutUtil::MinorToMajor(reduce->shape()).size() - 1; i > 0;
        --i) {
@@ -1703,7 +1677,7 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
 
   if (llvm::BasicBlock* innermost_body_bb =
           loop_nest.GetInnerLoopBodyBasicBlock()) {
-    SetToFirstInsertPoint(innermost_body_bb, &ir_builder_);
+    SetToFirstInsertPoint(innermost_body_bb, &b_);
   }
 
   auto outermost_loop_exit_block = loop_nest.GetOuterLoopExitBasicBlock();
@@ -1717,7 +1691,7 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
         tensorflow::strings::Printf("dim.%lld", innermost_dimension));
     array_index[innermost_dimension] = loop->GetIndVarValue();
 
-    SetToFirstInsertPoint(loop->GetBodyBasicBlock(), &ir_builder_);
+    SetToFirstInsertPoint(loop->GetBodyBasicBlock(), &b_);
 
     ShardedVectorType vector_type = CreateShardedVectorType(
         reduce->shape().element_type(), vectorization_factor);
@@ -1728,16 +1702,16 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
 
     llvm_ir::IrArray target_array = GetIrArrayFor(reduce);
     llvm::Value* output_address =
-        target_array.EmitArrayElementAddress(array_index, &ir_builder_);
+        target_array.EmitArrayElementAddress(array_index, &b_);
     EmitShardedVectorStore(output_address, accumulator, element_alignment,
                            target_array);
 
     if (auto exit_terminator = loop->GetExitBasicBlock()->getTerminator()) {
       CHECK_GT(LayoutUtil::MinorToMajor(reduce->shape()).size(), 1);
-      ir_builder_.SetInsertPoint(exit_terminator);
+      b_.SetInsertPoint(exit_terminator);
     } else {
       CHECK_EQ(LayoutUtil::MinorToMajor(reduce->shape()).size(), 1);
-      ir_builder_.SetInsertPoint(loop->GetExitBasicBlock());
+      b_.SetInsertPoint(loop->GetExitBasicBlock());
     }
   }
 
@@ -1747,8 +1721,8 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
   if (innermost_dimension_size % vectorization_factor) {
     // TODO(b/63775531): Consider using a scalar loop here to save on code size.
     array_index[innermost_dimension] =
-        ir_builder_.getInt64(innermost_dimension_size -
-                             (innermost_dimension_size % vectorization_factor));
+        b_.getInt64(innermost_dimension_size -
+                    (innermost_dimension_size % vectorization_factor));
 
     ShardedVectorType vector_type = CreateShardedVectorType(
         reduce->shape().element_type(),
@@ -1760,13 +1734,13 @@ StatusOr<bool> IrEmitter::EmitVectorizedReduce(
 
     llvm_ir::IrArray target_array = GetIrArrayFor(reduce);
     llvm::Value* output_address =
-        target_array.EmitArrayElementAddress(array_index, &ir_builder_);
+        target_array.EmitArrayElementAddress(array_index, &b_);
     EmitShardedVectorStore(output_address, accumulator, element_alignment,
                            target_array);
   }
 
   if (outermost_loop_exit_block) {
-    ir_builder_.SetInsertPoint(outermost_loop_exit_block);
+    b_.SetInsertPoint(outermost_loop_exit_block);
   }
 
   return true;
@@ -1785,22 +1759,22 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduce(
   PrimitiveType accumulator_type = reduce->shape().element_type();
   llvm::AllocaInst* accumulator_addr = llvm_ir::EmitAllocaAtFunctionEntry(
       llvm_ir::PrimitiveTypeToIrType(accumulator_type, module_), "accumulator",
-      &ir_builder_, MinimumAlignmentForPrimitiveType(accumulator_type));
+      &b_, MinimumAlignmentForPrimitiveType(accumulator_type));
   llvm::Value* init_value_addr = GetEmittedValueFor(init_value);
-  llvm::Value* load_init_value = ir_builder_.CreateLoad(init_value_addr);
-  ir_builder_.CreateStore(load_init_value, accumulator_addr);
+  llvm::Value* load_init_value = b_.CreateLoad(init_value_addr);
+  b_.CreateStore(load_init_value, accumulator_addr);
 
   // The enclosing loops go over all the target elements. Now we have to compute
   // the actual target element. For this, we build a new loop nest to iterate
   // over all the reduction dimensions in the argument.
   // AddLoopsForShapeOnDimensions will return an Index where induction Value*s
   // are placed for each dimension in dimensions, and all the rest are nullptrs.
-  llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &ir_builder_);
+  llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &b_);
   const llvm_ir::IrArray::Index reduced_dims_index =
       loops.AddLoopsForShapeOnDimensions(arg->shape(), dimensions,
                                          "reduction_dim");
 
-  SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_);
+  SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
 
   // Build a full index for the input argument, using reduced_dims_index as the
   // base. In reduced_dims_index only the reduction dimensions are filled in. We
@@ -1820,14 +1794,14 @@ StatusOr<llvm::Value*> IrEmitter::EmitTargetElementLoopBodyForReduce(
 
   // Apply the reduction function to the loaded value.
   llvm::Value* input_address =
-      arg_array.EmitArrayElementAddress(input_index, &ir_builder_);
+      arg_array.EmitArrayElementAddress(input_index, &b_);
   llvm::Value* result = EmitElementFunctionCall(
       reducer_function, reduce->shape(), {accumulator_addr, input_address},
       "reduce_function");
-  ir_builder_.CreateStore(result, accumulator_addr);
+  b_.CreateStore(result, accumulator_addr);
 
-  SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_);
-  return ir_builder_.CreateLoad(accumulator_addr);
+  SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_);
+  return b_.CreateLoad(accumulator_addr);
 }
 
 Status IrEmitter::HandleReduce(HloInstruction* reduce) {
@@ -1957,7 +1931,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
   llvm_ir::IrArray target_array = GetIrArrayFor(slice);
 
   const int64 num_outer_loops = outer_dims.size();
-  llvm_ir::ForLoopNest loops(IrName(slice), &ir_builder_);
+  llvm_ir::ForLoopNest loops(IrName(slice), &b_);
   llvm_ir::IrArray::Index target_index =
       loops.AddLoopsForShapeOnDimensions(slice->shape(), outer_dims, "slice");
 
@@ -1966,21 +1940,21 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
   // for the rest of the dimensions the copy writes to the full dimension.
   std::replace(target_index.begin(), target_index.end(),
                static_cast<llvm::Value*>(nullptr),
-               static_cast<llvm::Value*>(ir_builder_.getInt64(0)));
+               static_cast<llvm::Value*>(b_.getInt64(0)));
 
   if (num_outer_loops > 0) {
-    SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_);
+    SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
   }
 
   llvm_ir::IrArray source_array = GetIrArrayFor(operand);
   const llvm_ir::IrArray::Index source_index = target_index.SourceIndexOfSlice(
       /*shape=*/slice->shape(), /*starts=*/slice->slice_starts(),
-      /*strides=*/slice->slice_strides(), /*builder=*/&ir_builder_);
+      /*strides=*/slice->slice_strides(), /*builder=*/&b_);
 
-  llvm::Value* memcpy_dest = target_array.EmitArrayElementAddress(
-      target_index, &ir_builder_, "slice.dest");
-  llvm::Value* memcpy_source = source_array.EmitArrayElementAddress(
-      source_index, &ir_builder_, "slice.source");
+  llvm::Value* memcpy_dest =
+      target_array.EmitArrayElementAddress(target_index, &b_, "slice.dest");
+  llvm::Value* memcpy_source =
+      source_array.EmitArrayElementAddress(source_index, &b_, "slice.source");
 
   const int64 memcpy_elements =
       primitive_elements_per_logical_element * memcpy_logical_elements;
@@ -1997,7 +1971,7 @@ Status IrEmitter::HandleSlice(HloInstruction* slice) {
   }
 
   if (num_outer_loops > 0) {
-    SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_);
+    SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_);
   }
 
   return Status::OK();
@@ -2023,7 +1997,7 @@ Status IrEmitter::HandleDynamicUpdateSlice(
     auto operands = GetIrArraysForOperandsOf(dynamic_update_slice);
     return llvm_ir::EmitDynamicUpdateSliceInPlace(
         operands, GetIrArrayFor(dynamic_update_slice),
-        IrName(dynamic_update_slice, "in_place"), &ir_builder_);
+        IrName(dynamic_update_slice, "in_place"), &b_);
   }
   return DefaultAction(dynamic_update_slice);
 }
@@ -2057,43 +2031,41 @@ Status IrEmitter::HandlePad(HloInstruction* pad) {
       [this, pad](const llvm_ir::IrArray::Index& target_index) {
         const HloInstruction* padding_value = pad->operand(1);
         llvm::Value* padding_value_addr = GetEmittedValueFor(padding_value);
-        return ir_builder_.CreateLoad(padding_value_addr);
+        return b_.CreateLoad(padding_value_addr);
       }));
 
   // Create a loop to iterate over the operand elements and update the output
   // locations where the operand elements should be stored.
-  llvm_ir::ForLoopNest loops(IrName(pad, "assign"), &ir_builder_);
+  llvm_ir::ForLoopNest loops(IrName(pad, "assign"), &b_);
   const HloInstruction* operand = pad->operand(0);
   const llvm_ir::IrArray::Index operand_index =
       loops.AddLoopsForShape(operand->shape(), "operand");
 
-  SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_);
+  SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
 
   // Load an element from the operand.
   llvm_ir::IrArray operand_array(GetIrArrayFor(operand));
   llvm::Value* operand_data =
-      operand_array.EmitReadArrayElement(operand_index, &ir_builder_);
+      operand_array.EmitReadArrayElement(operand_index, &b_);
 
   // Compute the output index the operand element should be assigned to.
   // output_index := edge_padding_low + operand_index * (interior_padding + 1)
   const PaddingConfig& padding_config = pad->padding_config();
   llvm_ir::IrArray::Index output_index(operand_index.GetType());
   for (size_t i = 0; i < operand_index.size(); ++i) {
-    llvm::Value* offset = ir_builder_.CreateMul(
+    llvm::Value* offset = b_.CreateMul(
         operand_index[i],
-        ir_builder_.getInt64(padding_config.dimensions(i).interior_padding() +
-                             1));
-    llvm::Value* index = ir_builder_.CreateAdd(
-        offset,
-        ir_builder_.getInt64(padding_config.dimensions(i).edge_padding_low()));
+        b_.getInt64(padding_config.dimensions(i).interior_padding() + 1));
+    llvm::Value* index = b_.CreateAdd(
+        offset, b_.getInt64(padding_config.dimensions(i).edge_padding_low()));
     output_index.push_back(index);
   }
 
   // Store the operand element to the computed output location.
   llvm_ir::IrArray output_array(GetIrArrayFor(pad));
-  output_array.EmitWriteArrayElement(output_index, operand_data, &ir_builder_);
+  output_array.EmitWriteArrayElement(output_index, operand_data, &b_);
 
-  SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_);
+  SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_);
   return Status::OK();
 }
 
@@ -2115,8 +2087,7 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
     // Delegate to common implementation of fused in-place dynamic-update-slice.
     auto operands = GetIrArraysForOperandsOf(fusion);
     return llvm_ir::EmitFusedDynamicUpdateSliceInPlace(
-        fusion, operands, GetIrArrayFor(fusion), &elemental_emitter,
-        &ir_builder_);
+        fusion, operands, GetIrArrayFor(fusion), &elemental_emitter, &b_);
   } else if (fusion->fusion_kind() == HloInstruction::FusionKind::kLoop) {
     VLOG(3) << "HandleFusion kLoop";
     CpuElementalIrEmitter elemental_emitter(hlo_module_config_, this, module_);
@@ -2151,7 +2122,7 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
 
     TF_RETURN_IF_ERROR(DotOpEmitter::EmitDotOperation(
         *dot, target_array, lhs_array, rhs_array, &addend_array,
-        GetExecutableRunOptionsArgument(), &ir_builder_, hlo_module_config_,
+        GetExecutableRunOptionsArgument(), &b_, hlo_module_config_,
         target_machine_features_));
     return Status::OK();
   } else {
@@ -2174,7 +2145,7 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
     // ParallelTaskAssignment assigned partitions, emit call to
     // ParallelForkJoin.
     std::vector<llvm::Value*> call_args = GetArrayFunctionCallArguments(
-        parameter_addresses, &ir_builder_, computation->name(),
+        parameter_addresses, &b_, computation->name(),
         /*return_value_buffer=*/emitted_value_[call],
         /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(),
         /*temp_buffers_arg=*/GetTempBuffersArgument(),
@@ -2182,8 +2153,8 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
 
     HloInstruction* root = computation->root_instruction();
     TF_RETURN_IF_ERROR(EmitCallToParallelForkJoin(
-        call_args, root->shape(), root->outer_dimension_partitions(),
-        &ir_builder_, call_ir_function, computation->name()));
+        call_args, root->shape(), root->outer_dimension_partitions(), &b_,
+        call_ir_function, computation->name()));
   } else {
     EmitArrayFunctionCallInto(call_ir_function, parameter_addresses,
                               emitted_value_[call], computation->name());
@@ -2195,33 +2166,31 @@ Status IrEmitter::HandleCall(HloInstruction* call) {
 Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) {
   gtl::ArraySlice<HloInstruction*> operands(custom_call->operands());
   tensorflow::StringPiece custom_call_target(custom_call->custom_call_target());
-  llvm::Type* i8_ptr_type = ir_builder_.getInt8PtrTy();
+  llvm::Type* i8_ptr_type = b_.getInt8PtrTy();
   llvm::AllocaInst* operands_alloca =
       llvm_ir::EmitAllocaAtFunctionEntryWithCount(
-          i8_ptr_type, ir_builder_.getInt32(operands.size()),
-          "cc_operands_alloca", &ir_builder_);
+          i8_ptr_type, b_.getInt32(operands.size()), "cc_operands_alloca", &b_);
   for (size_t i = 0; i < operands.size(); ++i) {
     const HloInstruction* operand = operands[i];
     llvm::Value* operand_as_i8ptr =
-        ir_builder_.CreatePointerCast(GetEmittedValueFor(operand), i8_ptr_type);
-    llvm::Value* slot_in_operands_alloca = ir_builder_.CreateInBoundsGEP(
-        operands_alloca, {ir_builder_.getInt64(i)});
-    ir_builder_.CreateStore(operand_as_i8ptr, slot_in_operands_alloca);
+        b_.CreatePointerCast(GetEmittedValueFor(operand), i8_ptr_type);
+    llvm::Value* slot_in_operands_alloca =
+        b_.CreateInBoundsGEP(operands_alloca, {b_.getInt64(i)});
+    b_.CreateStore(operand_as_i8ptr, slot_in_operands_alloca);
   }
   auto* custom_call_ir_function =
       llvm::cast<llvm::Function>(module_->getOrInsertFunction(
           AsStringRef(custom_call_target),
           llvm::FunctionType::get(
-              /*Result=*/ir_builder_.getVoidTy(),
+              /*Result=*/b_.getVoidTy(),
               /*Params=*/{i8_ptr_type, operands_alloca->getType()},
               /*isVarArg=*/false)));
 
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(custom_call));
-  auto* output_address_arg = ir_builder_.CreatePointerCast(
-      GetEmittedValueFor(custom_call), i8_ptr_type);
+  auto* output_address_arg =
+      b_.CreatePointerCast(GetEmittedValueFor(custom_call), i8_ptr_type);
 
-  ir_builder_.CreateCall(custom_call_ir_function,
-                         {output_address_arg, operands_alloca});
+  b_.CreateCall(custom_call_ir_function, {output_address_arg, operands_alloca});
 
   return Status::OK();
 }
@@ -2286,8 +2255,8 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
   llvm::BasicBlock* header_bb = llvm::BasicBlock::Create(
       module_->getContext(), AsStringRef(IrName(xla_while, "header")),
       compute_function_->function());
-  ir_builder_.CreateBr(header_bb);
-  ir_builder_.SetInsertPoint(header_bb);
+  b_.CreateBr(header_bb);
+  b_.SetInsertPoint(header_bb);
 
   // Calls the condition function to determine whether to proceed with the
   // body.  It must return a bool, so use the scalar call form.
@@ -2295,7 +2264,7 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
   llvm::Value* while_condition = EmitElementFunctionCall(
       condition_ir_function, condition->root_instruction()->shape(),
       {while_result}, IrName(xla_while, "cond"));
-  llvm::Value* while_predicate = ir_builder_.CreateICmpNE(
+  llvm::Value* while_predicate = b_.CreateICmpNE(
       while_condition,
       llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0));
 
@@ -2305,20 +2274,20 @@ Status IrEmitter::HandleWhile(HloInstruction* xla_while) {
       compute_function_->function());
   llvm::BasicBlock* exit_bb = llvm::BasicBlock::Create(
       module_->getContext(), AsStringRef(IrName(xla_while, "exit")));
-  ir_builder_.CreateCondBr(while_predicate, body_bb, exit_bb);
+  b_.CreateCondBr(while_predicate, body_bb, exit_bb);
 
   // Calls the body function from the body block.
-  ir_builder_.SetInsertPoint(body_bb);
+  b_.SetInsertPoint(body_bb);
 
   // Calls the body function.
   EmitArrayFunctionCallInto(body_ir_function, {while_result}, while_result,
                             IrName(xla_while, "body"));
   // Finishes with a branch back to the header.
-  ir_builder_.CreateBr(header_bb);
+  b_.CreateBr(header_bb);
 
   // Adds the exit block to the function and sets the insert point there.
   compute_function_->function()->getBasicBlockList().push_back(exit_bb);
-  ir_builder_.SetInsertPoint(exit_bb);
+  b_.SetInsertPoint(exit_bb);
 
   return Status::OK();
 }
@@ -2360,21 +2329,21 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
   std::vector<int64> outer_dims(std::next(concat_dim_layout_itr),
                                 output_min2maj.end());
 
-  llvm::Type* i8_ptr_type = ir_builder_.getInt8PtrTy();
-  llvm::Type* i8_type = ir_builder_.getInt8Ty();
+  llvm::Type* i8_ptr_type = b_.getInt8PtrTy();
+  llvm::Type* i8_type = b_.getInt8Ty();
 
   TF_RETURN_IF_ERROR(EmitTargetAddressForOp(concatenate));
   llvm_ir::IrArray target_array = GetIrArrayFor(concatenate);
 
-  llvm_ir::ForLoopNest loops(IrName(concatenate), &ir_builder_);
+  llvm_ir::ForLoopNest loops(IrName(concatenate), &b_);
   llvm_ir::IrArray::Index outer_dims_index =
       loops.AddLoopsForShapeOnDimensions(output_shape, outer_dims, "concat");
   std::replace(outer_dims_index.begin(), outer_dims_index.end(),
                static_cast<llvm::Value*>(nullptr),
-               static_cast<llvm::Value*>(ir_builder_.getInt64(0)));
+               static_cast<llvm::Value*>(b_.getInt64(0)));
 
   if (!outer_dims.empty()) {
-    SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_);
+    SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
   }
 
   PrimitiveType primitive_type = output_shape.element_type();
@@ -2383,10 +2352,10 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
 
   // Contiguous subregions from each operand to the concatenate contribute to a
   // contiguous subregion in the target buffer starting at target_region_begin.
-  llvm::Value* target_region_begin = ir_builder_.CreateBitCast(
-      target_array.EmitArrayElementAddress(outer_dims_index, &ir_builder_,
-                                           "target_region"),
-      i8_ptr_type);
+  llvm::Value* target_region_begin =
+      b_.CreateBitCast(target_array.EmitArrayElementAddress(
+                           outer_dims_index, &b_, "target_region"),
+                       i8_ptr_type);
   int64 byte_offset_into_target_region = 0;
 
   int64 inner_dims_product =
@@ -2400,14 +2369,13 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
   for (HloInstruction* operand : operands) {
     const Shape& input_shape = operand->shape();
     llvm_ir::IrArray source_array = GetIrArrayFor(operand);
-    llvm::Value* copy_source_address = ir_builder_.CreateBitCast(
-        source_array.EmitArrayElementAddress(outer_dims_index, &ir_builder_,
-                                             "src_addr"),
+    llvm::Value* copy_source_address = b_.CreateBitCast(
+        source_array.EmitArrayElementAddress(outer_dims_index, &b_, "src_addr"),
         i8_ptr_type);
 
-    llvm::Value* copy_target_address = ir_builder_.CreateGEP(
-        i8_type, target_region_begin,
-        ir_builder_.getInt64(byte_offset_into_target_region));
+    llvm::Value* copy_target_address =
+        b_.CreateGEP(i8_type, target_region_begin,
+                     b_.getInt64(byte_offset_into_target_region));
 
     EmitTransferElements(
         copy_target_address, copy_source_address,
@@ -2420,7 +2388,7 @@ StatusOr<bool> IrEmitter::EmitFastConcatenate(
   }
 
   if (!outer_dims.empty()) {
-    SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_);
+    SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_);
   }
 
   return true;
@@ -2439,16 +2407,15 @@ void IrEmitter::EmitTransferElements(llvm::Value* target, llvm::Value* source,
       llvm_ir::PrimitiveTypeToIrType(primitive_type, module_));
 
   if (element_count == 1) {
-    auto* load_instruction = ir_builder_.CreateAlignedLoad(
-        ir_builder_.CreateBitCast(source, primitive_ptr_type),
-        element_alignment);
+    auto* load_instruction = b_.CreateAlignedLoad(
+        b_.CreateBitCast(source, primitive_ptr_type), element_alignment);
     source_array.AnnotateLoadStoreInstructionWithMetadata(load_instruction);
-    auto* store_instruction = ir_builder_.CreateAlignedStore(
-        load_instruction, ir_builder_.CreateBitCast(target, primitive_ptr_type),
+    auto* store_instruction = b_.CreateAlignedStore(
+        load_instruction, b_.CreateBitCast(target, primitive_ptr_type),
         element_alignment);
     target_array.AnnotateLoadStoreInstructionWithMetadata(store_instruction);
   } else {
-    auto* memcpy_instruction = ir_builder_.CreateMemCpy(
+    auto* memcpy_instruction = b_.CreateMemCpy(
         target, /*DstAlign=*/element_alignment, source,
         /*SrcAlign=*/element_alignment, element_count * primitive_type_size);
 
@@ -2518,24 +2485,24 @@ Status IrEmitter::HandleConditional(HloInstruction* conditional) {
   //     cond_result = true_computation(true_operand)
   //   else
   //     cond_result = false_computation(false_operand)
-  llvm::LoadInst* pred_value = ir_builder_.CreateLoad(
+  llvm::LoadInst* pred_value = b_.CreateLoad(
       GetIrArrayFor(pred).GetBasePointer(), "load_predicate_value");
-  llvm::Value* pred_cond = ir_builder_.CreateICmpNE(
+  llvm::Value* pred_cond = b_.CreateICmpNE(
       pred_value,
       llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(PRED, module_), 0),
       "boolean_predicate");
   llvm_ir::LlvmIfData if_data =
-      llvm_ir::EmitIfThenElse(pred_cond, "conditional", &ir_builder_);
+      llvm_ir::EmitIfThenElse(pred_cond, "conditional", &b_);
 
-  SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+  SetToFirstInsertPoint(if_data.true_block, &b_);
   EmitArrayFunctionCallInto(true_function, {GetEmittedValueFor(true_arg)},
                             conditional_result, IrName(conditional, "_true"));
 
-  SetToFirstInsertPoint(if_data.false_block, &ir_builder_);
+  SetToFirstInsertPoint(if_data.false_block, &b_);
   EmitArrayFunctionCallInto(false_function, {GetEmittedValueFor(false_arg)},
                             conditional_result, IrName(conditional, "_false"));
 
-  SetToFirstInsertPoint(if_data.after_block, &ir_builder_);
+  SetToFirstInsertPoint(if_data.after_block, &b_);
   return Status::OK();
 }
 
@@ -2546,6 +2513,28 @@ Status IrEmitter::HandleAfterAll(HloInstruction* gen_token) {
   return Status::OK();
 }
 
+Status IrEmitter::HandleIota(HloInstruction* iota) {
+  // TODO(b/64798317): implement iota on CPU.
+  return Unimplemented("Iota is not implemented on CPU.");
+}
+
+Status IrEmitter::HandleRng(HloInstruction* rng) {
+  ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator;
+  for (const HloInstruction* operand : rng->operands()) {
+    operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) {
+      return GetIrArrayFor(operand).EmitReadArrayElement(index, &b_);
+    };
+  }
+
+  CpuElementalIrEmitter elemental_emitter(hlo_module_config_, this, module_);
+  TF_RETURN_IF_ERROR(EmitTargetElementLoop(
+      rng, elemental_emitter.MakeElementGenerator(rng, operand_to_generator)));
+
+  llvm_ir::IncrementVariableForPhiloxRngState(1, module_, &b_);
+
+  return Status::OK();
+}
+
 Status IrEmitter::FinishVisit(HloInstruction* root) {
   // When this method is called, we should have already emitted an IR value for
   // the root (return) op. The IR value holds the address of the buffer holding
@@ -2563,7 +2552,7 @@ Status IrEmitter::FinishVisit(HloInstruction* root) {
 
   auto record_complete_computation = [&](llvm::Value* prof_counter) {
     if (prof_counter) {
-      profiling_state_.RecordCompleteComputation(&ir_builder_, prof_counter);
+      profiling_state_.RecordCompleteComputation(&b_, prof_counter);
     }
   };
 
@@ -2585,54 +2574,51 @@ llvm::Value* IrEmitter::GetProfileCounterCommon(
 
   int64 prof_counter_idx = it->second;
   string counter_name = IrName("prof_counter", hlo.name());
-  return ir_builder_.CreateGEP(GetProfileCountersArgument(),
-                               ir_builder_.getInt64(prof_counter_idx),
-                               AsStringRef(counter_name));
+  return b_.CreateGEP(GetProfileCountersArgument(),
+                      b_.getInt64(prof_counter_idx), AsStringRef(counter_name));
 }
 
-void IrEmitter::ProfilingState::UpdateProfileCounter(
-    llvm::IRBuilder<>* ir_builder, llvm::Value* prof_counter,
-    llvm::Value* cycle_end, llvm::Value* cycle_start) {
-  auto* cycle_diff = ir_builder->CreateSub(cycle_end, cycle_start);
+void IrEmitter::ProfilingState::UpdateProfileCounter(llvm::IRBuilder<>* b,
+                                                     llvm::Value* prof_counter,
+                                                     llvm::Value* cycle_end,
+                                                     llvm::Value* cycle_start) {
+  auto* cycle_diff = b->CreateSub(cycle_end, cycle_start);
   llvm::LoadInst* old_cycle_count =
-      ir_builder->CreateLoad(prof_counter, "old_cycle_count");
+      b->CreateLoad(prof_counter, "old_cycle_count");
   auto* new_cycle_count =
-      ir_builder->CreateAdd(cycle_diff, old_cycle_count, "new_cycle_count");
-  ir_builder->CreateStore(new_cycle_count, prof_counter);
+      b->CreateAdd(cycle_diff, old_cycle_count, "new_cycle_count");
+  b->CreateStore(new_cycle_count, prof_counter);
 }
 
-llvm::Value* IrEmitter::ProfilingState::ReadCycleCounter(
-    llvm::IRBuilder<>* ir_builder) {
-  llvm::Module* module = ir_builder->GetInsertBlock()->getModule();
+llvm::Value* IrEmitter::ProfilingState::ReadCycleCounter(llvm::IRBuilder<>* b) {
+  llvm::Module* module = b->GetInsertBlock()->getModule();
   if (use_rdtscp_) {
     llvm::Function* func_llvm_readcyclecounter =
         llvm::Intrinsic::getDeclaration(module,
                                         llvm::Intrinsic::readcyclecounter);
-    return ir_builder->CreateCall(func_llvm_readcyclecounter);
+    return b->CreateCall(func_llvm_readcyclecounter);
   }
   llvm::Function* func_llvm_x86_rdtscp =
       llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::x86_rdtscp);
   if (!aux_i8ptr_) {
-    llvm::AllocaInst* rdtscp_aux = llvm_ir::EmitAllocaAtFunctionEntry(
-        ir_builder->getInt32Ty(), "rdtscp_aux", ir_builder);
-    aux_i8ptr_ =
-        ir_builder->CreateBitCast(rdtscp_aux, ir_builder->getInt8PtrTy());
+    llvm::AllocaInst* rdtscp_aux =
+        llvm_ir::EmitAllocaAtFunctionEntry(b->getInt32Ty(), "rdtscp_aux", b);
+    aux_i8ptr_ = b->CreateBitCast(rdtscp_aux, b->getInt8PtrTy());
   }
-  llvm::ConstantInt* alloca_size = ir_builder->getInt64(4);
+  llvm::ConstantInt* alloca_size = b->getInt64(4);
   llvm::Function* func_llvm_lifetime_start =
       llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::lifetime_start);
-  ir_builder->CreateCall(func_llvm_lifetime_start, {alloca_size, aux_i8ptr_});
-  llvm::Value* rdtscp_call =
-      ir_builder->CreateCall(func_llvm_x86_rdtscp, aux_i8ptr_);
+  b->CreateCall(func_llvm_lifetime_start, {alloca_size, aux_i8ptr_});
+  llvm::Value* rdtscp_call = b->CreateCall(func_llvm_x86_rdtscp, aux_i8ptr_);
   llvm::Function* func_llvm_lifetime_end =
       llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::lifetime_end);
-  ir_builder->CreateCall(func_llvm_lifetime_end, {alloca_size, aux_i8ptr_});
+  b->CreateCall(func_llvm_lifetime_end, {alloca_size, aux_i8ptr_});
   return rdtscp_call;
 }
 
-void IrEmitter::ProfilingState::RecordCycleStart(llvm::IRBuilder<>* ir_builder,
+void IrEmitter::ProfilingState::RecordCycleStart(llvm::IRBuilder<>* b,
                                                  HloInstruction* hlo) {
-  auto* cycle_start = ReadCycleCounter(ir_builder);
+  auto* cycle_start = ReadCycleCounter(b);
   cycle_start->setName(AsStringRef(IrName(hlo, "cycle_start")));
   cycle_starts_[hlo] = cycle_start;
   if (first_read_cycle_start_ == nullptr) {
@@ -2640,20 +2626,20 @@ void IrEmitter::ProfilingState::RecordCycleStart(llvm::IRBuilder<>* ir_builder,
   }
 }
 
-void IrEmitter::ProfilingState::RecordCycleDelta(llvm::IRBuilder<>* ir_builder,
+void IrEmitter::ProfilingState::RecordCycleDelta(llvm::IRBuilder<>* b,
                                                  HloInstruction* hlo,
                                                  llvm::Value* prof_counter) {
-  auto* cycle_end = ReadCycleCounter(ir_builder);
+  auto* cycle_end = ReadCycleCounter(b);
   cycle_end->setName(AsStringRef(IrName(hlo, "cycle_end")));
   auto* cycle_start = cycle_starts_[hlo];
-  UpdateProfileCounter(ir_builder, prof_counter, cycle_end, cycle_start);
+  UpdateProfileCounter(b, prof_counter, cycle_end, cycle_start);
   last_read_cycle_end_ = cycle_end;
 }
 
 void IrEmitter::ProfilingState::RecordCompleteComputation(
-    llvm::IRBuilder<>* ir_builder, llvm::Value* prof_counter) {
+    llvm::IRBuilder<>* b, llvm::Value* prof_counter) {
   if (last_read_cycle_end_ && first_read_cycle_start_) {
-    UpdateProfileCounter(ir_builder, prof_counter, last_read_cycle_end_,
+    UpdateProfileCounter(b, prof_counter, last_read_cycle_end_,
                          first_read_cycle_start_);
   }
 }
@@ -2661,14 +2647,14 @@ void IrEmitter::ProfilingState::RecordCompleteComputation(
 Status IrEmitter::Preprocess(HloInstruction* hlo) {
   VLOG(3) << "Visiting: " << hlo->ToString();
   if (instruction_to_profile_idx_.count(hlo)) {
-    profiling_state_.RecordCycleStart(&ir_builder_, hlo);
+    profiling_state_.RecordCycleStart(&b_, hlo);
   }
   return Status::OK();
 }
 
 Status IrEmitter::Postprocess(HloInstruction* hlo) {
   if (auto* prof_counter = GetProfileCounterFor(*hlo)) {
-    profiling_state_.RecordCycleDelta(&ir_builder_, hlo, prof_counter);
+    profiling_state_.RecordCycleDelta(&b_, hlo, prof_counter);
   }
   return Status::OK();
 }
@@ -2727,22 +2713,24 @@ llvm::Value* IrEmitter::EmitTempBufferPointer(
     CHECK_EQ(1, assigned_buffers.size());
     const Shape& shape = assigned_buffers.begin()->first->shape();
 
-    llvm::AllocaInst*& tempbuf_address = thread_local_buffers_[{
-        ir_builder_.GetInsertBlock()->getParent(), slice}];
+    llvm::AllocaInst*& tempbuf_address =
+        thread_local_buffers_[{b_.GetInsertBlock()->getParent(), slice}];
     if (tempbuf_address == nullptr) {
       tempbuf_address = llvm_ir::EmitAllocaAtFunctionEntry(
           IrShapeType(shape),
-          tensorflow::strings::StrCat("thread_local", slice.ToString()),
-          &ir_builder_, MinimumAlignmentForShape(target_shape));
+          tensorflow::strings::StrCat("thread_local", slice.ToString()), &b_,
+          MinimumAlignmentForShape(target_shape));
     }
-    return ir_builder_.CreateBitCast(tempbuf_address,
-                                     element_type->getPointerTo());
+    return b_.CreateBitCast(tempbuf_address, element_type->getPointerTo());
+  }
+
+  if (allocation.is_constant()) {
+    return FindOrDie(constant_buffer_to_global_, allocation.index());
   }
 
   llvm::Value* tempbuf_address_ptr = llvm_ir::EmitBufferIndexingGEP(
-      GetTempBuffersArgument(), slice.index(), &ir_builder_);
-  llvm::LoadInst* tempbuf_address_base =
-      ir_builder_.CreateLoad(tempbuf_address_ptr);
+      GetTempBuffersArgument(), slice.index(), &b_);
+  llvm::LoadInst* tempbuf_address_base = b_.CreateLoad(tempbuf_address_ptr);
   if (is_top_level_computation_ &&
       hlo_module_config_.debug_options()
           .xla_llvm_enable_invariant_load_metadata()) {
@@ -2761,11 +2749,11 @@ llvm::Value* IrEmitter::EmitTempBufferPointer(
   llvm::Value* tempbuf_address_untyped = tempbuf_address_base;
   if (slice.offset() > 0) {
     // Adjust the address to account for the slice offset.
-    tempbuf_address_untyped = ir_builder_.CreateInBoundsGEP(
-        tempbuf_address_base, ir_builder_.getInt64(slice.offset()));
+    tempbuf_address_untyped =
+        b_.CreateInBoundsGEP(tempbuf_address_base, b_.getInt64(slice.offset()));
   }
-  return ir_builder_.CreateBitCast(tempbuf_address_untyped,
-                                   element_type->getPointerTo());
+  return b_.CreateBitCast(tempbuf_address_untyped,
+                          element_type->getPointerTo());
 }
 
 // Emits a function call returning a single array element.  Allocates space
@@ -2776,7 +2764,7 @@ llvm::Value* IrEmitter::EmitElementFunctionCall(
     tensorflow::StringPiece name) {
   llvm::Value* return_value_buffer = EmitArrayFunctionCall(
       function, return_shape, 1, parameter_addresses, name);
-  return ir_builder_.CreateLoad(
+  return b_.CreateLoad(
       return_value_buffer,
       AsStringRef(tensorflow::strings::StrCat(name, "_return_value")));
 }
@@ -2794,9 +2782,9 @@ llvm::Value* IrEmitter::EmitElementFunctionCall(
 void IrEmitter::EmitArrayFunctionCallInto(
     llvm::Function* function, gtl::ArraySlice<llvm::Value*> parameter_addresses,
     llvm::Value* return_value_buffer, tensorflow::StringPiece name) {
-  ir_builder_.CreateCall(
-      function, GetArrayFunctionCallArguments(
-                    parameter_addresses, &ir_builder_, name,
+  b_.CreateCall(function,
+                GetArrayFunctionCallArguments(
+                    parameter_addresses, &b_, name,
                     /*return_value_buffer=*/return_value_buffer,
                     /*exec_run_options_arg=*/GetExecutableRunOptionsArgument(),
                     /*temp_buffers_arg=*/GetTempBuffersArgument(),
@@ -2808,13 +2796,13 @@ llvm::Value* IrEmitter::EmitArrayFunctionCall(
     gtl::ArraySlice<llvm::Value*> parameter_addresses,
     tensorflow::StringPiece name) {
   llvm::Value* elements =
-      llvm::ConstantInt::get(ir_builder_.getInt64Ty(), element_count);
+      llvm::ConstantInt::get(b_.getInt64Ty(), element_count);
   PrimitiveType return_type = return_shape.element_type();
   llvm::Value* return_value_buffer =
       llvm_ir::EmitAllocaAtFunctionEntryWithCount(
           llvm_ir::PrimitiveTypeToIrType(return_type, module_), elements,
-          tensorflow::strings::StrCat(name, "_return_value_address"),
-          &ir_builder_, MinimumAlignmentForPrimitiveType(return_type));
+          tensorflow::strings::StrCat(name, "_return_value_address"), &b_,
+          MinimumAlignmentForPrimitiveType(return_type));
   EmitArrayFunctionCallInto(function, parameter_addresses, return_value_buffer,
                             name);
   return return_value_buffer;
@@ -2836,8 +2824,7 @@ Status IrEmitter::EmitTargetAddressForOp(const HloInstruction* op) {
       attr_builder.addDereferenceableAttr(ByteSizeOf(target_shape));
       retval->addAttrs(attr_builder);
     }
-    addr = ir_builder_.CreateBitCast(retval,
-                                     IrShapeType(target_shape)->getPointerTo());
+    addr = b_.CreateBitCast(retval, IrShapeType(target_shape)->getPointerTo());
   } else {
     // For other nodes, we need the temporary buffer allocated for this node to
     // write the result into.
@@ -2879,14 +2866,14 @@ Status IrEmitter::EmitTargetElementLoop(
           llvm_ir::IrArray(op_target_address, element_shape));
     }
     TF_RETURN_IF_ERROR(
-        llvm_ir::LoopEmitter(element_generator, output_arrays, &ir_builder_)
+        llvm_ir::LoopEmitter(element_generator, output_arrays, &b_)
             .EmitLoop(IrName(target_op)));
 
     std::vector<llvm::Value*> tuple_operand_ptrs;
     for (int64 i = 0; i < output_arrays.size(); ++i) {
       tuple_operand_ptrs.push_back(output_arrays[i].GetBasePointer());
     }
-    llvm_ir::EmitTuple(target_array, tuple_operand_ptrs, &ir_builder_, module_);
+    llvm_ir::EmitTuple(target_array, tuple_operand_ptrs, &b_, module_);
 
   } else {
     if (ShouldEmitParallelLoopFor(*target_op)) {
@@ -2895,11 +2882,11 @@ Status IrEmitter::EmitTargetElementLoop(
           compute_function_->GetDynamicLoopBounds();
       // Emit parallel loop with dynamic loop bounds for most-major dimensions.
       TF_RETURN_IF_ERROR(ParallelLoopEmitter(element_generator, target_array,
-                                             &dynamic_loop_bounds, &ir_builder_)
+                                             &dynamic_loop_bounds, &b_)
                              .EmitLoop(IrName(target_op)));
     } else {
       TF_RETURN_IF_ERROR(
-          llvm_ir::LoopEmitter(element_generator, target_array, &ir_builder_)
+          llvm_ir::LoopEmitter(element_generator, target_array, &b_)
               .EmitLoop(IrName(target_op)));
     }
   }
@@ -2912,8 +2899,8 @@ Status IrEmitter::EmitMemcpy(const HloInstruction& source,
   llvm::Value* destination_value = GetEmittedValueFor(&destination);
   int64 source_size = ByteSizeOf(source.shape());
   // TODO(b/63762267): Be more aggressive about specifying alignment.
-  ir_builder_.CreateMemCpy(destination_value, /*DstAlign=*/1, source_value,
-                           /*SrcAlign=*/1, source_size);
+  b_.CreateMemCpy(destination_value, /*DstAlign=*/1, source_value,
+                  /*SrcAlign=*/1, source_size);
   return Status::OK();
 }
 
@@ -2941,7 +2928,7 @@ Status IrEmitter::DefaultAction(HloInstruction* hlo) {
   ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator;
   for (const HloInstruction* operand : hlo->operands()) {
     operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) {
-      return GetIrArrayFor(operand).EmitReadArrayElement(index, &ir_builder_);
+      return GetIrArrayFor(operand).EmitReadArrayElement(index, &b_);
     };
   }
   CpuElementalIrEmitter elemental_emitter(hlo_module_config_, this, module_);
@@ -2956,8 +2943,8 @@ StatusOr<llvm::Value*> IrEmitter::EmitScalarCall(
   std::vector<llvm::Value*> argument_addrs;
   for (auto argument : arguments) {
     llvm::Value* argument_addr = llvm_ir::EmitAllocaAtFunctionEntry(
-        argument->getType(), "arg_addr", &ir_builder_);
-    ir_builder_.CreateStore(argument, argument_addr);
+        argument->getType(), "arg_addr", &b_);
+    b_.CreateStore(argument, argument_addr);
     argument_addrs.push_back(argument_addr);
   }
   return EmitElementFunctionCall(llvm_function,
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 419f19c24db9f2f0f8c6ae81eaa62fec372bdd4a..03bbb2afb587e2f95bcd2743d396d3d996041a21 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -98,13 +98,16 @@ class IrEmitter : public DfsHloVisitorWithDefault {
       bool is_top_level_computation,
       std::vector<const HloInstruction*>* instruction_order);
 
-  llvm::IRBuilder<>* ir_builder() { return &ir_builder_; }
+  llvm::IRBuilder<>* b() { return &b_; }
 
   // Emits a call to `computation` with scalar arguments `arguments`.
   StatusOr<llvm::Value*> EmitScalarCall(
       PrimitiveType return_type, HloComputation* computation,
       const std::vector<llvm::Value*>& arguments, tensorflow::StringPiece name);
 
+  // Emit an LLVM global variable for every constant buffer allocation.
+  Status EmitConstantGlobals();
+
  protected:
   //
   // The following methods implement the DfsHloVisitor interface.
@@ -148,6 +151,8 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status HandleConcatenate(HloInstruction* concatenate) override;
   Status HandleConditional(HloInstruction* conditional) override;
   Status HandleAfterAll(HloInstruction* gen_token) override;
+  Status HandleIota(HloInstruction* iota) override;
+  Status HandleRng(HloInstruction* rng) override;
   Status FinishVisit(HloInstruction* root) override;
 
   Status Preprocess(HloInstruction* hlo) override;
@@ -415,7 +420,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   // creates the encapsulated llvm::Function s.t. it is added to the llvm
   // module's function list).
   std::unique_ptr<IrFunction> compute_function_;
-  llvm::IRBuilder<> ir_builder_;
+  llvm::IRBuilder<> b_;
 
   // Maps HLO instructions to their index into the profile counter array.
   const std::unordered_map<const HloInstruction*, int64>
@@ -451,23 +456,22 @@ class IrEmitter : public DfsHloVisitorWithDefault {
         : use_rdtscp_(use_rdtscp), prof_counters_(prof_counters) {}
 
     // Record the cycle counter before an HLO executes.
-    void RecordCycleStart(llvm::IRBuilder<>* ir_builder, HloInstruction* hlo);
+    void RecordCycleStart(llvm::IRBuilder<>* b, HloInstruction* hlo);
     // Record the number of cycles it took for an HLO to execute.
-    void RecordCycleDelta(llvm::IRBuilder<>* ir_builder, HloInstruction* hlo,
+    void RecordCycleDelta(llvm::IRBuilder<>* b, HloInstruction* hlo,
                           llvm::Value* prof_counter);
     // Record the number of cycles it took for the entire computation to
     // execute.
-    void RecordCompleteComputation(llvm::IRBuilder<>* ir_builder,
+    void RecordCompleteComputation(llvm::IRBuilder<>* b,
                                    llvm::Value* prof_counter);
 
     // Convenience function to generate a call to an intrinsic which reads the
     // CPU cycle counter.
-    llvm::Value* ReadCycleCounter(llvm::IRBuilder<>* ir_builder);
+    llvm::Value* ReadCycleCounter(llvm::IRBuilder<>* b);
 
     // Store the cycle counter delta to the per-HLO profile counter.
-    void UpdateProfileCounter(llvm::IRBuilder<>* ir_builder,
-                              llvm::Value* prof_counter, llvm::Value* cycle_end,
-                              llvm::Value* cycle_start);
+    void UpdateProfileCounter(llvm::IRBuilder<>* b, llvm::Value* prof_counter,
+                              llvm::Value* cycle_end, llvm::Value* cycle_start);
 
    private:
     // Should we use the x86-specific rdtscp or the generic readcyclecounter
@@ -559,6 +563,9 @@ class IrEmitter : public DfsHloVisitorWithDefault {
                            LiteralPtrHashFunctor, LiteralPtrEqualityFunctor>
       emitted_literals_;
 
+  tensorflow::gtl::FlatMap<BufferAllocation::Index, llvm::Constant*>
+      constant_buffer_to_global_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(IrEmitter);
 };
 
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.cc b/tensorflow/compiler/xla/service/cpu/ir_function.cc
index 2d6f2f3818a7bd4424aaa7d918ca86abef15c0e9..6aff838462ac6bfe8a31971108a721b66dbe45bd 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_function.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.cc
@@ -49,11 +49,10 @@ IrFunction::IrFunction(const string& function_name,
                        llvm::Function::LinkageTypes linkage,
                        const bool optimize_for_size_requested,
                        const bool enable_fast_math, llvm::Module* llvm_module,
-                       llvm::IRBuilder<>* ir_builder,
-                       int64 num_dynamic_loop_bounds)
-    : ir_builder_(ir_builder),
+                       llvm::IRBuilder<>* b, int64 num_dynamic_loop_bounds)
+    : b_(b),
       llvm_module_(llvm_module),
-      caller_insert_point_guard_(*ir_builder),
+      caller_insert_point_guard_(*b),
       num_dynamic_loop_bounds_(num_dynamic_loop_bounds) {
   Initialize(function_name, linkage, optimize_for_size_requested,
              enable_fast_math);
@@ -61,7 +60,7 @@ IrFunction::IrFunction(const string& function_name,
 
 IrFunction::~IrFunction() {
   // Emit function return value.
-  ir_builder_->CreateRetVoid();
+  b_->CreateRetVoid();
 }
 
 DynamicLoopBounds IrFunction::GetDynamicLoopBounds() {
@@ -174,7 +173,7 @@ void IrFunction::Initialize(const string& function_name,
     function_->addAttribute(argument.getArgNo() + 1, llvm::Attribute::NoAlias);
   }
 
-  ir_builder_->SetInsertPoint(llvm::BasicBlock::Create(
+  b_->SetInsertPoint(llvm::BasicBlock::Create(
       /*Context=*/llvm_module_->getContext(),
       /*Name=*/"entry",
       /*Parent=*/function_));
@@ -184,9 +183,8 @@ llvm::Value* IrFunction::GetDynamicLoopBound(const int64 offset) {
   CHECK_GT(num_dynamic_loop_bounds_, 0);
   CHECK_LT(offset, num_dynamic_loop_bounds_ * 2);
   string name = tensorflow::strings::StrCat("dynamic_loop_bound_", offset);
-  return ir_builder_->CreateLoad(
-      ir_builder_->CreateGEP(CHECK_NOTNULL(dynamic_loop_bounds_arg_),
-                             ir_builder_->getInt64(offset), AsStringRef(name)));
+  return b_->CreateLoad(b_->CreateGEP(CHECK_NOTNULL(dynamic_loop_bounds_arg_),
+                                      b_->getInt64(offset), AsStringRef(name)));
 }
 
 // Emits code to allocate an array of parameter address pointers, and store
@@ -195,27 +193,25 @@ llvm::Value* IrFunction::GetDynamicLoopBound(const int64 offset) {
 // address buffer).
 std::vector<llvm::Value*> GetArrayFunctionCallArguments(
     tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
-    llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece name,
+    llvm::IRBuilder<>* b, tensorflow::StringPiece name,
     llvm::Value* return_value_buffer, llvm::Value* exec_run_options_arg,
     llvm::Value* temp_buffers_arg, llvm::Value* profile_counters_arg) {
   llvm::Value* parameter_addresses_buffer =
       llvm_ir::EmitAllocaAtFunctionEntryWithCount(
-          ir_builder->getInt8PtrTy(),
-          ir_builder->getInt32(parameter_addresses.size()),
-          tensorflow::strings::StrCat(name, "_parameter_addresses"),
-          ir_builder);
+          b->getInt8PtrTy(), b->getInt32(parameter_addresses.size()),
+          tensorflow::strings::StrCat(name, "_parameter_addresses"), b);
   for (size_t i = 0; i < parameter_addresses.size(); ++i) {
-    llvm::Value* parameter_as_i8ptr = ir_builder->CreateBitCast(
-        parameter_addresses[i], ir_builder->getInt8PtrTy(),
-        AsStringRef(tensorflow::strings::StrCat(name, "_parameter_", i,
-                                                "_address_as_i8ptr")));
-    llvm::Value* slot_in_param_addresses = ir_builder->CreateInBoundsGEP(
-        parameter_addresses_buffer, {ir_builder->getInt64(i)});
-    ir_builder->CreateStore(parameter_as_i8ptr, slot_in_param_addresses);
+    llvm::Value* parameter_as_i8ptr =
+        b->CreateBitCast(parameter_addresses[i], b->getInt8PtrTy(),
+                         AsStringRef(tensorflow::strings::StrCat(
+                             name, "_parameter_", i, "_address_as_i8ptr")));
+    llvm::Value* slot_in_param_addresses =
+        b->CreateInBoundsGEP(parameter_addresses_buffer, {b->getInt64(i)});
+    b->CreateStore(parameter_as_i8ptr, slot_in_param_addresses);
   }
 
   const auto to_int8_ptr = [=](llvm::Value* ptr) {
-    return ir_builder->CreatePointerCast(ptr, ir_builder->getInt8PtrTy());
+    return b->CreatePointerCast(ptr, b->getInt8PtrTy());
   };
   std::vector<llvm::Value*> arguments{
       to_int8_ptr(return_value_buffer), to_int8_ptr(exec_run_options_arg),
@@ -230,22 +226,21 @@ std::vector<llvm::Value*> GetArrayFunctionCallArguments(
 // calls to 'parallel_function' (and joins threads before returning).
 Status EmitCallToParallelForkJoin(
     const std::vector<llvm::Value*>& arguments, const Shape& shape,
-    const std::vector<int64>& dimension_partition_counts,
-    llvm::IRBuilder<>* ir_builder, llvm::Function* parallel_function,
-    const string& name) {
-  llvm::Module* module = ir_builder->GetInsertBlock()->getModule();
+    const std::vector<int64>& dimension_partition_counts, llvm::IRBuilder<>* b,
+    llvm::Function* parallel_function, const string& name) {
+  llvm::Module* module = b->GetInsertBlock()->getModule();
 
   // Build ParallelForkJoin function type.
   std::vector<llvm::Type*> compute_function_params =
       GetComputeFunctionParams(module, /*num_dynamic_loop_bounds=*/0);
   // Number of parallel compute functions.
-  compute_function_params.push_back(ir_builder->getInt32Ty());
+  compute_function_params.push_back(b->getInt32Ty());
   // Array of partitions. There is an array element for each
   // partition x partition_dim x 2 (for dimension start and limit).
   compute_function_params.push_back(
       llvm::Type::getInt64PtrTy(module->getContext()));
   // Number of partitioned most-major dimensions in 'shape'.
-  compute_function_params.push_back(ir_builder->getInt32Ty());
+  compute_function_params.push_back(b->getInt32Ty());
   // Function pointer for compute function to be dispatched in parallel.
   compute_function_params.push_back(
       llvm::Type::getInt8PtrTy(module->getContext()));
@@ -268,7 +263,7 @@ Status EmitCallToParallelForkJoin(
   ShapePartitionIterator partition_iterator(shape, dimension_partition_counts);
   const int64 num_partitions = partition_iterator.GetTotalPartitionCount();
   // Add argument specifying the number of parallel partitions.
-  fork_join_arguments.push_back(ir_builder->getInt32(num_partitions));
+  fork_join_arguments.push_back(b->getInt32(num_partitions));
 
   // The number of partitioned most-major dimensions in 'shape'.
   const int32 num_partitioned_dims = dimension_partition_counts.size();
@@ -293,15 +288,15 @@ Status EmitCallToParallelForkJoin(
       const std::pair<int64, int64>& dim_partition = dim_partitions[j];
       const int32 index = partition_index + j * dim_partition_size;
       // Store partition [dim_start, dim_limit) intervals for each dimension.
-      partitions[index] = ir_builder->getInt64(dim_partition.first);
+      partitions[index] = b->getInt64(dim_partition.first);
       partitions[index + 1] =
-          ir_builder->getInt64(dim_partition.first + dim_partition.second);
+          b->getInt64(dim_partition.first + dim_partition.second);
     }
   }
 
   // Create global variable out of dimension partitions in 'partitions'.
   llvm::ArrayType* partitions_array_type =
-      llvm::ArrayType::get(ir_builder->getInt64Ty(), partition_array_size);
+      llvm::ArrayType::get(b->getInt64Ty(), partition_array_size);
   llvm::Constant* partitions_array =
       llvm::ConstantArray::get(partitions_array_type, partitions);
   llvm::GlobalVariable* global_partitions_array = new llvm::GlobalVariable(
@@ -315,16 +310,16 @@ Status EmitCallToParallelForkJoin(
           tensorflow::strings::StrCat(name, "_parallel_dimension_partitions")));
 
   // Add argument specifying parallel dimension partitions.
-  fork_join_arguments.push_back(ir_builder->CreateBitCast(
-      global_partitions_array,
-      llvm::Type::getInt64PtrTy(module->getContext())));
+  fork_join_arguments.push_back(
+      b->CreateBitCast(global_partitions_array,
+                       llvm::Type::getInt64PtrTy(module->getContext())));
   // Add argument specifying the number of partitioned most-major dimensions.
-  fork_join_arguments.push_back(ir_builder->getInt32(num_partitioned_dims));
+  fork_join_arguments.push_back(b->getInt32(num_partitioned_dims));
   // Add argument for parallel compute function pointer.
   fork_join_arguments.push_back(
-      ir_builder->CreateBitCast(parallel_function, ir_builder->getInt8PtrTy()));
+      b->CreateBitCast(parallel_function, b->getInt8PtrTy()));
   // Emit call to parallel fork/join.
-  ir_builder->CreateCall(fork_join_func, fork_join_arguments);
+  b->CreateCall(fork_join_func, fork_join_arguments);
 
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.h b/tensorflow/compiler/xla/service/cpu/ir_function.h
index 2e55181eed867aca762f2b9b8310624ea12c7487..a41cbb64cdd9f5b6de5d1eadfbf7e63e1e984801 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_function.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.h
@@ -54,7 +54,7 @@ class IrFunction {
   IrFunction(const string& function_name, llvm::Function::LinkageTypes linkage,
              const bool optimize_for_size_requested,
              const bool enable_fast_math, llvm::Module* llvm_module,
-             llvm::IRBuilder<>* ir_builder, int64 num_dynamic_loop_bounds);
+             llvm::IRBuilder<>* b, int64 num_dynamic_loop_bounds);
   ~IrFunction();
 
   // Emit ir to read and return the set of ir values representing the dynamic
@@ -97,7 +97,7 @@ class IrFunction {
   // 'offset' from the "dynamic_loop_bounds" argument of this function.
   llvm::Value* GetDynamicLoopBound(int64 offset);
 
-  llvm::IRBuilder<>* ir_builder_;
+  llvm::IRBuilder<>* b_;
   llvm::Module* llvm_module_;
   llvm::IRBuilder<>::InsertPointGuard caller_insert_point_guard_;
 
@@ -116,7 +116,7 @@ class IrFunction {
 // Returns an array of compute function call argument ir values.
 std::vector<llvm::Value*> GetArrayFunctionCallArguments(
     tensorflow::gtl::ArraySlice<llvm::Value*> parameter_addresses,
-    llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece name,
+    llvm::IRBuilder<>* b, tensorflow::StringPiece name,
     llvm::Value* return_value_buffer, llvm::Value* exec_run_options_arg,
     llvm::Value* temp_buffers_arg, llvm::Value* profile_counters_arg);
 
@@ -124,9 +124,8 @@ std::vector<llvm::Value*> GetArrayFunctionCallArguments(
 // calls to 'parallel_function' (and joins threads before returning).
 Status EmitCallToParallelForkJoin(
     const std::vector<llvm::Value*>& arguments, const Shape& shape,
-    const std::vector<int64>& dimension_partition_counts,
-    llvm::IRBuilder<>* ir_builder, llvm::Function* parallel_function,
-    const string& name);
+    const std::vector<int64>& dimension_partition_counts, llvm::IRBuilder<>* b,
+    llvm::Function* parallel_function, const string& name);
 
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
index 2e5cc96098241415b82f225afc81981f3e1069e0..cef5e57b0b12b7ae93af0d2508b2b9d6a592d390 100644
--- a/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
+++ b/tensorflow/compiler/xla/service/cpu/llvm_ir_runtime.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "llvm/IR/Verifier.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "tensorflow/compiler/xla/service/cpu/vector_support_library.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/math_ops.h"
 #include "tensorflow/core/lib/core/casts.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -52,46 +53,14 @@ llvm::Function* EmitVectorF32TanhIfNeeded(llvm::Module* module,
   llvm::BasicBlock* vector_tanh_body =
       llvm::BasicBlock::Create(*context, "body", vector_tanh_function);
 
-  llvm::IRBuilder<> ir_builder(vector_tanh_body);
+  llvm::IRBuilder<> b(vector_tanh_body);
   llvm::FastMathFlags fast_math_flags;
-  fast_math_flags.setFast();
-  ir_builder.setFastMathFlags(fast_math_flags);
-
-  VectorSupportLibrary vsl(F32, vector_width, &ir_builder, "tanh_f32");
+  fast_math_flags.setFast(enable_fast_math);
+  b.setFastMathFlags(fast_math_flags);
 
   llvm::Value* input = &*vector_tanh_function->arg_begin();
-  CHECK_EQ(input->getType(), vsl.vector_type());
-
-  // This implements the same rational interpolant as implemented in Eigen3.
-  llvm::Value* input_clamped =
-      vsl.Clamp(input, /*low=*/GetIeeeF32(-9.0), /*high=*/GetIeeeF32(9.0));
-
-  std::array<float, 7> numerator_coeffs{
-      -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
-      5.12229709037114e-08f,  1.48572235717979e-05f, 6.37261928875436e-04f,
-      4.89352455891786e-03f};
-
-  std::array<float, 4> denominator_coeffs{
-      1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
-      4.89352518554385e-03f};
-
-  llvm::Value* input_squared = vsl.Mul(input_clamped, input_clamped);
-  llvm::Value* numerator = vsl.SplatFloat(GetIeeeF32(numerator_coeffs[0]));
-  for (int i = 1; i < numerator_coeffs.size(); i++) {
-    numerator =
-        vsl.MulAdd(input_squared, numerator, GetIeeeF32(numerator_coeffs[i]));
-  }
-
-  numerator = vsl.Mul(input_clamped, numerator);
-
-  llvm::Value* denominator = vsl.SplatFloat(GetIeeeF32(denominator_coeffs[0]));
-  for (int i = 1; i < denominator_coeffs.size(); i++) {
-    denominator = vsl.MulAdd(input_squared, denominator,
-                             GetIeeeF32(denominator_coeffs[i]));
-  }
-
-  llvm::Value* result = vsl.Div(numerator, denominator);
-  ir_builder.CreateRet(result);
+  CHECK_EQ(vector_width, input->getType()->getVectorNumElements());
+  b.CreateRet(llvm_ir::EmitFastTanh(&b, input));
 
   DCHECK(!llvm::verifyFunction(*vector_tanh_function));
   return vector_tanh_function;
@@ -113,12 +82,12 @@ llvm::Function* EmitVectorF32ExpIfNeeded(llvm::Module* module,
   llvm::BasicBlock* vector_exp_body =
       llvm::BasicBlock::Create(*context, "body", vector_exp_function);
 
-  llvm::IRBuilder<> ir_builder(vector_exp_body);
+  llvm::IRBuilder<> b(vector_exp_body);
   llvm::FastMathFlags fast_math_flags;
   fast_math_flags.setFast();
-  ir_builder.setFastMathFlags(fast_math_flags);
+  b.setFastMathFlags(fast_math_flags);
 
-  VectorSupportLibrary vsl(F32, vector_width, &ir_builder, "exp_f32");
+  VectorSupportLibrary vsl(F32, vector_width, &b, "exp_f32");
 
   // This implements the same polynomial approximation as implemented in Eigen3.
 
@@ -160,21 +129,21 @@ llvm::Function* EmitVectorF32ExpIfNeeded(llvm::Module* module,
   // VectorSupportLibrary (intentionally) can't juggle more than one type at a
   // time so drop down to IRBuilder for this bit.
   llvm::Value* vector_constant_0x7f =
-      ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(0x7f));
+      b.CreateVectorSplat(vector_width, b.getInt32(0x7f));
   llvm::Value* vector_constant_23 =
-      ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(23));
+      b.CreateVectorSplat(vector_width, b.getInt32(23));
   llvm::Type* i32_vector_type =
-      llvm::VectorType::get(ir_builder.getInt32Ty(), vector_width);
+      llvm::VectorType::get(b.getInt32Ty(), vector_width);
   // fx is clamped so we don't have to worry about it being out of range for
   // i32.
-  llvm::Value* emm0 = ir_builder.CreateFPToSI(fx, i32_vector_type);
-  emm0 = ir_builder.CreateAdd(emm0, vector_constant_0x7f);
-  emm0 = ir_builder.CreateShl(emm0, vector_constant_23);
-  llvm::Value* emm0_f32 = ir_builder.CreateBitCast(emm0, vsl.vector_type());
+  llvm::Value* emm0 = b.CreateFPToSI(fx, i32_vector_type);
+  emm0 = b.CreateAdd(emm0, vector_constant_0x7f);
+  emm0 = b.CreateShl(emm0, vector_constant_23);
+  llvm::Value* emm0_f32 = b.CreateBitCast(emm0, vsl.vector_type());
 
   llvm::Value* result = vsl.Max(vsl.Mul(y, emm0_f32), input);
 
-  ir_builder.CreateRet(result);
+  b.CreateRet(result);
 
   DCHECK(!llvm::verifyFunction(*vector_exp_function));
   return vector_exp_function;
@@ -196,13 +165,13 @@ llvm::Function* EmitVectorF32LogIfNeeded(llvm::Module* module,
   llvm::BasicBlock* vector_log_body =
       llvm::BasicBlock::Create(*context, "body", vector_log_function);
 
-  llvm::IRBuilder<> ir_builder(vector_log_body);
+  llvm::IRBuilder<> b(vector_log_body);
   llvm::FastMathFlags fast_math_flags;
   fast_math_flags.setFast();
-  ir_builder.setFastMathFlags(fast_math_flags);
+  b.setFastMathFlags(fast_math_flags);
 
   llvm::Value* input = &*vector_log_function->arg_begin();
-  VectorSupportLibrary vsl(F32, vector_width, &ir_builder, "log_f32");
+  VectorSupportLibrary vsl(F32, vector_width, &b, "log_f32");
 
   const llvm::APFloat half = GetIeeeF32(0.5);
   const llvm::APFloat one = GetIeeeF32(1.0);
@@ -238,22 +207,21 @@ llvm::Function* EmitVectorF32LogIfNeeded(llvm::Module* module,
   // VectorSupportLibrary (intentionally) can't juggle more than one type at a
   // time so drop down to IRBuilder for this bit.
   llvm::Value* vector_constant_0x7f =
-      ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(0x7f));
+      b.CreateVectorSplat(vector_width, b.getInt32(0x7f));
   llvm::Value* vector_constant_23 =
-      ir_builder.CreateVectorSplat(vector_width, ir_builder.getInt32(23));
+      b.CreateVectorSplat(vector_width, b.getInt32(23));
   llvm::Type* i32_vector_type =
-      llvm::VectorType::get(ir_builder.getInt32Ty(), vector_width);
+      llvm::VectorType::get(b.getInt32Ty(), vector_width);
 
-  llvm::Value* emm0 = ir_builder.CreateLShr(
-      ir_builder.CreateBitCast(input, i32_vector_type), vector_constant_23);
+  llvm::Value* emm0 =
+      b.CreateLShr(b.CreateBitCast(input, i32_vector_type), vector_constant_23);
 
   // Keep only the fractional part.
   input = vsl.FloatAnd(input, inv_mant_mask);
   input = vsl.FloatOr(input, half);
 
-  emm0 = ir_builder.CreateSub(emm0, vector_constant_0x7f);
-  llvm::Value* e =
-      vsl.Add(one, ir_builder.CreateSIToFP(emm0, vsl.vector_type()));
+  emm0 = b.CreateSub(emm0, vector_constant_0x7f);
+  llvm::Value* e = vsl.Add(one, b.CreateSIToFP(emm0, vsl.vector_type()));
 
   // part2:
   //   if( x < SQRTHF ) {
@@ -294,7 +262,7 @@ llvm::Function* EmitVectorF32LogIfNeeded(llvm::Module* module,
   llvm::Value* or_rhs = vsl.FloatAnd(iszero_mask, minus_inf);
   llvm::Value* result = vsl.FloatOr(or_lhs, or_rhs);
 
-  ir_builder.CreateRet(result);
+  b.CreateRet(result);
 
   DCHECK(!llvm::verifyFunction(*vector_log_function));
   return vector_log_function;
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
index 59ae5acd8b7cea049f09eaf4cc98b41339973c77..8560e4296aa95fe791446abb1b4363b9145f343e 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc
@@ -25,8 +25,8 @@ namespace cpu {
 ParallelLoopEmitter::ParallelLoopEmitter(
     const llvm_ir::ElementGenerator& target_element_generator,
     const llvm_ir::IrArray& target_array,
-    const DynamicLoopBounds* dynamic_loop_bounds, llvm::IRBuilder<>* ir_builder)
-    : LoopEmitter(target_element_generator, target_array, ir_builder),
+    const DynamicLoopBounds* dynamic_loop_bounds, llvm::IRBuilder<>* b)
+    : LoopEmitter(target_element_generator, target_array, b),
       dynamic_loop_bounds_(dynamic_loop_bounds) {}
 
 std::vector<llvm_ir::IrArray::Index>
@@ -37,7 +37,7 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   CHECK(!ShapeUtil::IsTuple(shape_));
   CHECK(!ShapeUtil::IsScalar(shape_));
 
-  llvm_ir::ForLoopNest loop_nest(loop_name, ir_builder_);
+  llvm_ir::ForLoopNest loop_nest(loop_name, b_);
   const int64 num_dims = shape_.dimensions_size();
   llvm_ir::IrArray::Index array_index(index_type, num_dims);
 
@@ -65,8 +65,7 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
     }
   }
   // Point IR builder at inner loop BB.
-  llvm_ir::SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(),
-                                 ir_builder_);
+  llvm_ir::SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(), b_);
 
   // Set exit_bb_ to the exit block of the loop nest.
   exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock();
diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
index 25e182a26d6f21c7eba550020cf17403aa92abf7..076c683ca566f2c53992c358903d2aadead290f9 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h
@@ -54,7 +54,7 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   ParallelLoopEmitter(const llvm_ir::ElementGenerator& target_element_generator,
                       const llvm_ir::IrArray& target_array,
                       const DynamicLoopBounds* dynamic_loop_bounds,
-                      llvm::IRBuilder<>* ir_builder);
+                      llvm::IRBuilder<>* b);
 
   ParallelLoopEmitter(const ParallelLoopEmitter&) = delete;
   ParallelLoopEmitter& operator=(const ParallelLoopEmitter&) = delete;
diff --git a/tensorflow/compiler/xla/service/cpu/sample_harness.cc b/tensorflow/compiler/xla/service/cpu/sample_harness.cc
index d9e8dcaed98dc8bc33bf1355b0acddb56faa3c71..f227e4ae139b92e56786e38ef8eef72c9e2cd424 100644
--- a/tensorflow/compiler/xla/service/cpu/sample_harness.cc
+++ b/tensorflow/compiler/xla/service/cpu/sample_harness.cc
@@ -21,8 +21,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD
index b4c33e2f6cad8455e1e4ff2f020a167121e80a6f..181cec3cdddeb40daf5276d9d1d6a139417a6072 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD
@@ -135,9 +135,9 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:cpu_plugin",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc
index 0d45918d0990c0f00c08f321dad015cfbc038bf3..c433bddc8432949905041b5e9e31fc6af9e8bd44 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
index ccb61740f6b717ce7dc2a6f614d6d2c8b4d7a9a5..01daed4bcd38323bfe33e798a78c2b00b150a1bc 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
@@ -78,7 +78,7 @@ TEST_F(CpuNoAliasTest, Concat) {
   llvm::Function* func = llvm::cast<llvm::Function>(
       ir_module.getOrInsertFunction("test_fn", llvm::Type::getVoidTy(context)));
   llvm::BasicBlock* bb = llvm::BasicBlock::Create(context, "body", func);
-  llvm::IRBuilder<> ir_builder(bb);
+  llvm::IRBuilder<> b(bb);
   auto* zero = llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), 0);
   llvm_ir::IrArray::Index zero2D({zero, zero});
 
@@ -90,7 +90,7 @@ TEST_F(CpuNoAliasTest, Concat) {
         ir_module.getOrInsertGlobal("param_x", array2d_type);
     llvm_ir::IrArray param_x_array(param_x_val, param_shape);
     aa.AddAliasingInformationToIrArray(*param_x, &param_x_array);
-    param_x_array.EmitReadArrayElement(zero2D, &ir_builder)
+    param_x_array.EmitReadArrayElement(zero2D, &b)
         ->setName("read_param_x_array");
   }
 
@@ -100,7 +100,7 @@ TEST_F(CpuNoAliasTest, Concat) {
     auto shape = ShapeUtil::MakeShape(F32, {2, 4});
     llvm_ir::IrArray concat1_array(concat1_val, shape);
     aa.AddAliasingInformationToIrArray(*concat1, &concat1_array);
-    concat1_array.EmitReadArrayElement(zero2D, &ir_builder)
+    concat1_array.EmitReadArrayElement(zero2D, &b)
         ->setName("read_concat1_array");
   }
 
@@ -110,7 +110,7 @@ TEST_F(CpuNoAliasTest, Concat) {
     auto shape = ShapeUtil::MakeShape(F32, {2, 6});
     llvm_ir::IrArray concat2_array(concat2_val, shape);
     aa.AddAliasingInformationToIrArray(*concat2, &concat2_array);
-    concat2_array.EmitReadArrayElement(zero2D, &ir_builder)
+    concat2_array.EmitReadArrayElement(zero2D, &b)
         ->setName("read_concat2_array");
   }
 
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
index c444d151858d3a152a01b99657ffae89ebc6b487..3274be8d9dbfaa55e250748a389ad34fdeb81922 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
@@ -23,14 +23,14 @@ namespace xla {
 namespace cpu {
 VectorSupportLibrary::VectorSupportLibrary(PrimitiveType primitive_type,
                                            int64 vector_size,
-                                           llvm::IRBuilder<>* ir_builder,
+                                           llvm::IRBuilder<>* b,
                                            std::string name)
     : vector_size_(vector_size),
       primitive_type_(primitive_type),
-      ir_builder_(ir_builder),
+      b_(b),
       name_(std::move(name)) {
   scalar_type_ = llvm_ir::PrimitiveTypeToIrType(
-      primitive_type, ir_builder_->GetInsertBlock()->getModule());
+      primitive_type, b_->GetInsertBlock()->getModule());
   scalar_pointer_type_ = llvm::PointerType::getUnqual(scalar_type_);
   vector_type_ = llvm::VectorType::get(scalar_type_, vector_size);
   vector_pointer_type_ = llvm::PointerType::getUnqual(vector_type_);
@@ -63,9 +63,9 @@ llvm::Value* VectorSupportLibrary::Mul(llvm::Value* lhs, llvm::Value* rhs) {
 llvm::Value* VectorSupportLibrary::MulInternal(llvm::Value* lhs,
                                                llvm::Value* rhs) {
   if (scalar_type_->isFloatingPointTy()) {
-    return ir_builder()->CreateFMul(lhs, rhs, name());
+    return b()->CreateFMul(lhs, rhs, name());
   } else {
-    return ir_builder()->CreateMul(lhs, rhs, name());
+    return b()->CreateMul(lhs, rhs, name());
   }
 }
 
@@ -76,13 +76,13 @@ llvm::Value* VectorSupportLibrary::Add(llvm::Value* lhs, llvm::Value* rhs) {
 
 llvm::Value* VectorSupportLibrary::Sub(llvm::Value* lhs, llvm::Value* rhs) {
   AssertCorrectTypes({lhs, rhs});
-  return ir_builder()->CreateFSub(lhs, rhs);
+  return b()->CreateFSub(lhs, rhs);
 }
 
 llvm::Value* VectorSupportLibrary::Max(llvm::Value* lhs, llvm::Value* rhs) {
   AssertCorrectTypes({lhs, rhs});
   if (scalar_type_->isFloatingPointTy()) {
-    return llvm_ir::EmitFloatMax(lhs, rhs, ir_builder_);
+    return llvm_ir::EmitFloatMax(lhs, rhs, b_);
   } else {
     LOG(FATAL) << "Max for integers is unimplemented";
   }
@@ -91,13 +91,13 @@ llvm::Value* VectorSupportLibrary::Max(llvm::Value* lhs, llvm::Value* rhs) {
 llvm::Value* VectorSupportLibrary::Floor(llvm::Value* a) {
   AssertCorrectTypes({a});
   return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::floor, {a},
-                                      {a->getType()}, ir_builder());
+                                      {a->getType()}, b());
 }
 
 llvm::Value* VectorSupportLibrary::Div(llvm::Value* lhs, llvm::Value* rhs) {
   AssertCorrectTypes({lhs, rhs});
   if (scalar_type_->isFloatingPointTy()) {
-    return ir_builder()->CreateFDiv(lhs, rhs, name());
+    return b()->CreateFDiv(lhs, rhs, name());
   } else {
     LOG(FATAL) << "Division for integers is unimplemented";
   }
@@ -111,42 +111,41 @@ llvm::Value* VectorSupportLibrary::Clamp(llvm::Value* a,
   CHECK(low.compare(high) == llvm::APFloat::cmpLessThan);
   CHECK(scalar_type_->isFloatingPointTy());
   return llvm_ir::EmitFloatMin(
-      llvm_ir::EmitFloatMax(a, GetConstantFloat(type, low), ir_builder_),
-      GetConstantFloat(type, high), ir_builder_);
+      llvm_ir::EmitFloatMax(a, GetConstantFloat(type, low), b_),
+      GetConstantFloat(type, high), b_);
 }
 
 llvm::Value* VectorSupportLibrary::FCmpEQMask(llvm::Value* lhs,
                                               llvm::Value* rhs) {
   AssertCorrectTypes({lhs, rhs});
-  return I1ToFloat(ir_builder()->CreateFCmpOEQ(lhs, rhs, name()));
+  return I1ToFloat(b()->CreateFCmpOEQ(lhs, rhs, name()));
 }
 
 llvm::Value* VectorSupportLibrary::FCmpOLTMask(llvm::Value* lhs,
                                                llvm::Value* rhs) {
   AssertCorrectTypes({lhs, rhs});
-  return I1ToFloat(ir_builder()->CreateFCmpOLT(lhs, rhs, name()));
+  return I1ToFloat(b()->CreateFCmpOLT(lhs, rhs, name()));
 }
 
 llvm::Value* VectorSupportLibrary::FCmpULEMask(llvm::Value* lhs,
                                                llvm::Value* rhs) {
   AssertCorrectTypes({lhs, rhs});
-  return I1ToFloat(ir_builder()->CreateFCmpULE(lhs, rhs, name()));
+  return I1ToFloat(b()->CreateFCmpULE(lhs, rhs, name()));
 }
 
 llvm::Value* VectorSupportLibrary::I1ToFloat(llvm::Value* i1) {
   bool is_vector = llvm::isa<llvm::VectorType>(i1->getType());
   llvm::Type* integer_type = IntegerTypeForFloatSize(is_vector);
-  return ir_builder()->CreateBitCast(
-      ir_builder()->CreateSExt(i1, integer_type, name()),
-      is_vector ? vector_type() : scalar_type(), name());
+  return b()->CreateBitCast(b()->CreateSExt(i1, integer_type, name()),
+                            is_vector ? vector_type() : scalar_type(), name());
 }
 
 llvm::Type* VectorSupportLibrary::IntegerTypeForFloatSize(bool vector) {
   CHECK(scalar_type()->isFloatingPointTy());
   const llvm::DataLayout& data_layout =
-      ir_builder()->GetInsertBlock()->getModule()->getDataLayout();
+      b()->GetInsertBlock()->getModule()->getDataLayout();
   int64 float_size_bits = data_layout.getTypeSizeInBits(scalar_type());
-  llvm::Type* scalar_int_type = ir_builder()->getIntNTy(float_size_bits);
+  llvm::Type* scalar_int_type = b()->getIntNTy(float_size_bits);
   if (vector) {
     return llvm::VectorType::get(scalar_int_type, vector_size());
   } else {
@@ -156,7 +155,7 @@ llvm::Type* VectorSupportLibrary::IntegerTypeForFloatSize(bool vector) {
 
 llvm::Value* VectorSupportLibrary::BroadcastScalar(llvm::Value* x) {
   CHECK_EQ(x->getType(), scalar_type());
-  return ir_builder()->CreateVectorSplat(vector_size(), x, name());
+  return b()->CreateVectorSplat(vector_size(), x, name());
 }
 
 llvm::Value* VectorSupportLibrary::FloatAnd(llvm::Value* lhs,
@@ -164,10 +163,9 @@ llvm::Value* VectorSupportLibrary::FloatAnd(llvm::Value* lhs,
   AssertCorrectTypes({lhs, rhs});
   llvm::Type* int_type =
       IntegerTypeForFloatSize(lhs->getType() == vector_type());
-  return ir_builder()->CreateBitCast(
-      ir_builder()->CreateAnd(
-          ir_builder()->CreateBitCast(lhs, int_type, name()),
-          ir_builder()->CreateBitCast(rhs, int_type, name()), name()),
+  return b()->CreateBitCast(
+      b()->CreateAnd(b()->CreateBitCast(lhs, int_type, name()),
+                     b()->CreateBitCast(rhs, int_type, name()), name()),
       vector_type());
 }
 
@@ -175,9 +173,8 @@ llvm::Value* VectorSupportLibrary::FloatNot(llvm::Value* lhs) {
   AssertCorrectTypes({lhs});
   llvm::Type* int_type =
       IntegerTypeForFloatSize(lhs->getType() == vector_type());
-  return ir_builder()->CreateBitCast(
-      ir_builder()->CreateNot(
-          ir_builder()->CreateBitCast(lhs, int_type, name()), name()),
+  return b()->CreateBitCast(
+      b()->CreateNot(b()->CreateBitCast(lhs, int_type, name()), name()),
       vector_type());
 }
 
@@ -185,47 +182,43 @@ llvm::Value* VectorSupportLibrary::FloatOr(llvm::Value* lhs, llvm::Value* rhs) {
   AssertCorrectTypes({lhs, rhs});
   llvm::Type* int_type =
       IntegerTypeForFloatSize(lhs->getType() == vector_type());
-  return ir_builder()->CreateBitCast(
-      ir_builder()->CreateOr(ir_builder()->CreateBitCast(lhs, int_type, name()),
-                             ir_builder()->CreateBitCast(rhs, int_type, name()),
-                             name()),
+  return b()->CreateBitCast(
+      b()->CreateOr(b()->CreateBitCast(lhs, int_type, name()),
+                    b()->CreateBitCast(rhs, int_type, name()), name()),
       vector_type(), name());
 }
 
 llvm::Value* VectorSupportLibrary::AddInternal(llvm::Value* lhs,
                                                llvm::Value* rhs) {
   if (scalar_type_->isFloatingPointTy()) {
-    return ir_builder()->CreateFAdd(lhs, rhs, name());
+    return b()->CreateFAdd(lhs, rhs, name());
   } else {
-    return ir_builder()->CreateAdd(lhs, rhs, name());
+    return b()->CreateAdd(lhs, rhs, name());
   }
 }
 
 llvm::Value* VectorSupportLibrary::ComputeOffsetPointer(
     llvm::Value* base_pointer, llvm::Value* offset_elements) {
   if (base_pointer->getType() != scalar_pointer_type()) {
-    base_pointer = ir_builder()->CreateBitCast(base_pointer,
-                                               scalar_pointer_type(), name());
+    base_pointer =
+        b()->CreateBitCast(base_pointer, scalar_pointer_type(), name());
   }
-  return ir_builder()->CreateInBoundsGEP(base_pointer, {offset_elements},
-                                         name());
+  return b()->CreateInBoundsGEP(base_pointer, {offset_elements}, name());
 }
 
 llvm::Value* VectorSupportLibrary::LoadVector(llvm::Value* pointer) {
   if (pointer->getType() != vector_pointer_type()) {
-    pointer =
-        ir_builder()->CreateBitCast(pointer, vector_pointer_type(), name());
+    pointer = b()->CreateBitCast(pointer, vector_pointer_type(), name());
   }
-  return ir_builder()->CreateAlignedLoad(
+  return b()->CreateAlignedLoad(
       pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_), name());
 }
 
 llvm::Value* VectorSupportLibrary::LoadScalar(llvm::Value* pointer) {
   if (pointer->getType() != scalar_pointer_type()) {
-    pointer =
-        ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
+    pointer = b()->CreateBitCast(pointer, scalar_pointer_type(), name());
   }
-  return ir_builder()->CreateAlignedLoad(
+  return b()->CreateAlignedLoad(
       pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_), name());
 }
 
@@ -233,30 +226,28 @@ void VectorSupportLibrary::StoreVector(llvm::Value* value,
                                        llvm::Value* pointer) {
   AssertCorrectTypes({value});
   if (pointer->getType() != vector_pointer_type()) {
-    pointer = ir_builder()->CreateBitCast(pointer, vector_pointer_type());
+    pointer = b()->CreateBitCast(pointer, vector_pointer_type());
   }
-  ir_builder()->CreateAlignedStore(
-      value, pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_));
+  b()->CreateAlignedStore(value, pointer,
+                          ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_));
 }
 
 void VectorSupportLibrary::StoreScalar(llvm::Value* value,
                                        llvm::Value* pointer) {
   AssertCorrectTypes({value});
   if (pointer->getType() != scalar_pointer_type()) {
-    pointer =
-        ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
+    pointer = b()->CreateBitCast(pointer, scalar_pointer_type(), name());
   }
-  ir_builder()->CreateAlignedStore(
-      value, pointer, ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_));
+  b()->CreateAlignedStore(value, pointer,
+                          ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_));
 }
 
 llvm::Value* VectorSupportLibrary::LoadBroadcast(llvm::Value* pointer) {
   if (pointer->getType() != scalar_pointer_type()) {
-    pointer =
-        ir_builder()->CreateBitCast(pointer, scalar_pointer_type(), name());
+    pointer = b()->CreateBitCast(pointer, scalar_pointer_type(), name());
   }
-  return ir_builder()->CreateVectorSplat(
-      vector_size(), ir_builder()->CreateLoad(pointer), name());
+  return b()->CreateVectorSplat(vector_size(), b()->CreateLoad(pointer),
+                                name());
 }
 
 llvm::Value* VectorSupportLibrary::AddReduce(llvm::Value* vector) {
@@ -267,20 +258,19 @@ llvm::Value* VectorSupportLibrary::AddReduce(llvm::Value* vector) {
 
     for (unsigned j = 0; j < vector_size(); ++j) {
       if (j < (i / 2)) {
-        mask[j] = ir_builder()->getInt32(i / 2 + j);
+        mask[j] = b()->getInt32(i / 2 + j);
       } else {
-        mask[j] = llvm::UndefValue::get(ir_builder()->getInt32Ty());
+        mask[j] = llvm::UndefValue::get(b()->getInt32Ty());
       }
     }
 
-    llvm::Value* half_remaining_lanes = ir_builder()->CreateShuffleVector(
-        vector, llvm::UndefValue::get(vector_type()),
-        llvm::ConstantVector::get(mask), "");
+    llvm::Value* half_remaining_lanes =
+        b()->CreateShuffleVector(vector, llvm::UndefValue::get(vector_type()),
+                                 llvm::ConstantVector::get(mask), "");
     vector = Add(vector, half_remaining_lanes);
   }
 
-  return ir_builder()->CreateExtractElement(vector, ir_builder()->getInt32(0),
-                                            name());
+  return b()->CreateExtractElement(vector, b()->getInt32(0), name());
 }
 
 llvm::Value* VectorSupportLibrary::AvxStyleHorizontalAdd(llvm::Value* lhs,
@@ -307,19 +297,19 @@ llvm::Value* VectorSupportLibrary::AvxStyleHorizontalAdd(llvm::Value* lhs,
   // vector, which are the lanes 2 and 3 in the rhs vector.
   for (int i = 0; i < vector_size(); i += 2) {
     int increment = i < vector_size() / 2 ? 0 : (vector_size() / 2);
-    mask_a.push_back(ir_builder()->getInt32(increment + i));
-    mask_b.push_back(ir_builder()->getInt32(increment + i + 1));
+    mask_a.push_back(b()->getInt32(increment + i));
+    mask_b.push_back(b()->getInt32(increment + i + 1));
   }
   for (int i = 0; i < vector_size(); i += 2) {
     int increment = i < vector_size() / 2 ? (vector_size() / 2) : vector_size();
-    mask_a.push_back(ir_builder()->getInt32(increment + i));
-    mask_b.push_back(ir_builder()->getInt32(increment + i + 1));
+    mask_a.push_back(b()->getInt32(increment + i));
+    mask_b.push_back(b()->getInt32(increment + i + 1));
   }
 
-  llvm::Value* shuffle_0 = ir_builder()->CreateShuffleVector(
-      lhs, rhs, llvm::ConstantVector::get(mask_a));
-  llvm::Value* shuffle_1 = ir_builder()->CreateShuffleVector(
-      lhs, rhs, llvm::ConstantVector::get(mask_b));
+  llvm::Value* shuffle_0 =
+      b()->CreateShuffleVector(lhs, rhs, llvm::ConstantVector::get(mask_a));
+  llvm::Value* shuffle_1 =
+      b()->CreateShuffleVector(lhs, rhs, llvm::ConstantVector::get(mask_b));
 
   return Add(shuffle_0, shuffle_1);
 }
@@ -327,23 +317,21 @@ llvm::Value* VectorSupportLibrary::AvxStyleHorizontalAdd(llvm::Value* lhs,
 llvm::Value* VectorSupportLibrary::ExtractLowHalf(llvm::Value* vector) {
   llvm::SmallVector<llvm::Constant*, 32> mask;
   for (int i = 0; i < vector_size() / 2; i++) {
-    mask.push_back(ir_builder()->getInt32(i));
+    mask.push_back(b()->getInt32(i));
   }
 
-  return ir_builder()->CreateShuffleVector(vector,
-                                           llvm::UndefValue::get(vector_type()),
-                                           llvm::ConstantVector::get(mask));
+  return b()->CreateShuffleVector(vector, llvm::UndefValue::get(vector_type()),
+                                  llvm::ConstantVector::get(mask));
 }
 
 llvm::Value* VectorSupportLibrary::ExtractHighHalf(llvm::Value* vector) {
   llvm::SmallVector<llvm::Constant*, 32> mask;
   for (int i = 0; i < vector_size() / 2; i++) {
-    mask.push_back(ir_builder()->getInt32(i + vector_size() / 2));
+    mask.push_back(b()->getInt32(i + vector_size() / 2));
   }
 
-  return ir_builder()->CreateShuffleVector(vector,
-                                           llvm::UndefValue::get(vector_type()),
-                                           llvm::ConstantVector::get(mask));
+  return b()->CreateShuffleVector(vector, llvm::UndefValue::get(vector_type()),
+                                  llvm::ConstantVector::get(mask));
 }
 
 std::vector<llvm::Value*> VectorSupportLibrary::ComputeHorizontalSums(
@@ -360,8 +348,8 @@ std::vector<llvm::Value*> VectorSupportLibrary::ComputeHorizontalSums(
                  [this](llvm::Value* vector) { return AddReduce(vector); });
   if (init_values) {
     for (int64 i = 0, e = result.size(); i < e; i++) {
-      result[i] = Add(result[i], ir_builder()->CreateExtractElement(
-                                     init_values, ir_builder()->getInt32(i)));
+      result[i] = Add(result[i],
+                      b()->CreateExtractElement(init_values, b()->getInt32(i)));
     }
   }
   return result;
@@ -398,9 +386,9 @@ VectorSupportLibrary::ComputeAvxOptimizedHorizontalSums(
 
   std::vector<llvm::Value*> results;
   for (int i = 0; i < lane_width; i++) {
-    llvm::Value* scalar_result = ir_builder()->CreateExtractElement(
-        i < (lane_width / 2) ? low : high,
-        ir_builder()->getInt32(i % (lane_width / 2)), name());
+    llvm::Value* scalar_result =
+        b()->CreateExtractElement(i < (lane_width / 2) ? low : high,
+                                  b()->getInt32(i % (lane_width / 2)), name());
     results.push_back(scalar_result);
   }
 
@@ -415,17 +403,14 @@ llvm::Value* VectorSupportLibrary::GetZeroScalar() {
   return llvm::Constant::getNullValue(scalar_type());
 }
 
-LlvmVariable::LlvmVariable(llvm::Type* type, llvm::IRBuilder<>* ir_builder)
-    : ir_builder_(ir_builder) {
-  alloca_ = llvm_ir::EmitAllocaAtFunctionEntry(type, "", ir_builder_);
+LlvmVariable::LlvmVariable(llvm::Type* type, llvm::IRBuilder<>* b) : b_(b) {
+  alloca_ = llvm_ir::EmitAllocaAtFunctionEntry(type, "", b_);
 }
 
-llvm::Value* LlvmVariable::Get() const {
-  return ir_builder_->CreateLoad(alloca_);
-}
+llvm::Value* LlvmVariable::Get() const { return b_->CreateLoad(alloca_); }
 
 void LlvmVariable::Set(llvm::Value* new_value) {
-  ir_builder_->CreateStore(new_value, alloca_);
+  b_->CreateStore(new_value, alloca_);
 }
 
 TileVariable::TileVariable(VectorSupportLibrary* vector_support,
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
index 49c2a4e2f4bae9e1672b7d2fe891301bce08bd4b..c728f6df0aef83e6ddc6c932a347f14da06d9d0d 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
@@ -46,11 +46,11 @@ class VectorSupportLibrary {
   // instance (i.e. LoadVector will load a vector of type <`vector_size` x
   // `primitive_type`>).
   VectorSupportLibrary(PrimitiveType primitive_type, int64 vector_size,
-                       llvm::IRBuilder<>* ir_builder, std::string name);
+                       llvm::IRBuilder<>* b, std::string name);
 
   llvm::Value* Mul(llvm::Value* lhs, llvm::Value* rhs);
   llvm::Value* Mul(int64 lhs, llvm::Value* rhs) {
-    return Mul(ir_builder()->getInt64(lhs), rhs);
+    return Mul(b()->getInt64(lhs), rhs);
   }
   llvm::Value* Mul(const llvm::APFloat& lhs, llvm::Value* rhs) {
     return Mul(GetConstantFloat(rhs->getType(), lhs), rhs);
@@ -63,7 +63,7 @@ class VectorSupportLibrary {
 
   llvm::Value* Add(llvm::Value* lhs, llvm::Value* rhs);
   llvm::Value* Add(int64 lhs, llvm::Value* rhs) {
-    return Add(ir_builder()->getInt64(lhs), rhs);
+    return Add(b()->getInt64(lhs), rhs);
   }
   llvm::Value* Add(const llvm::APFloat& lhs, llvm::Value* rhs) {
     return Add(GetConstantFloat(rhs->getType(), lhs), rhs);
@@ -147,13 +147,11 @@ class VectorSupportLibrary {
   llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
                                     llvm::Value* offset_elements, int64 scale) {
     return ComputeOffsetPointer(
-        base_pointer,
-        ir_builder_->CreateMul(ir_builder_->getInt64(scale), offset_elements));
+        base_pointer, b_->CreateMul(b_->getInt64(scale), offset_elements));
   }
   llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer,
                                     int64 offset_elements) {
-    return ComputeOffsetPointer(base_pointer,
-                                ir_builder()->getInt64(offset_elements));
+    return ComputeOffsetPointer(base_pointer, b()->getInt64(offset_elements));
   }
 
   llvm::Value* LoadVector(llvm::Value* pointer);
@@ -164,7 +162,7 @@ class VectorSupportLibrary {
   }
 
   llvm::Value* LoadVector(llvm::Value* base_pointer, int64 offset_elements) {
-    return LoadVector(base_pointer, ir_builder()->getInt64(offset_elements));
+    return LoadVector(base_pointer, b()->getInt64(offset_elements));
   }
 
   llvm::Value* LoadScalar(llvm::Value* pointer);
@@ -175,7 +173,7 @@ class VectorSupportLibrary {
   }
 
   llvm::Value* LoadScalar(llvm::Value* base_pointer, int64 offset_elements) {
-    return LoadScalar(base_pointer, ir_builder()->getInt64(offset_elements));
+    return LoadScalar(base_pointer, b()->getInt64(offset_elements));
   }
 
   void StoreVector(llvm::Value* value, llvm::Value* pointer);
@@ -187,7 +185,7 @@ class VectorSupportLibrary {
 
   void StoreVector(llvm::Value* value, llvm::Value* base_pointer,
                    int64 offset_elements) {
-    StoreVector(value, base_pointer, ir_builder()->getInt64(offset_elements));
+    StoreVector(value, base_pointer, b()->getInt64(offset_elements));
   }
 
   void StoreScalar(llvm::Value* value, llvm::Value* pointer);
@@ -198,7 +196,7 @@ class VectorSupportLibrary {
 
   void StoreScalar(llvm::Value* value, llvm::Value* base_pointer,
                    int64 offset_elements) {
-    StoreScalar(base_pointer, ir_builder()->getInt64(offset_elements));
+    StoreScalar(base_pointer, b()->getInt64(offset_elements));
   }
 
   llvm::Value* LoadBroadcast(llvm::Value* pointer);
@@ -207,7 +205,7 @@ class VectorSupportLibrary {
     return LoadBroadcast(ComputeOffsetPointer(base_pointer, offset_elements));
   }
   llvm::Value* LoadBroadcast(llvm::Value* base_pointer, int64 offset_elements) {
-    return LoadBroadcast(base_pointer, ir_builder()->getInt64(offset_elements));
+    return LoadBroadcast(base_pointer, b()->getInt64(offset_elements));
   }
 
   // Compute the horizontal sum of each vector in `vectors`.  The i'th element
@@ -220,7 +218,7 @@ class VectorSupportLibrary {
   llvm::Value* GetZeroVector();
   llvm::Value* GetZeroScalar();
 
-  llvm::IRBuilder<>* ir_builder() const { return ir_builder_; }
+  llvm::IRBuilder<>* b() const { return b_; }
   int64 vector_size() const { return vector_size_; }
   llvm::Type* vector_type() const { return vector_type_; }
   llvm::Type* vector_pointer_type() const { return vector_pointer_type_; }
@@ -277,7 +275,7 @@ class VectorSupportLibrary {
 
   int64 vector_size_;
   PrimitiveType primitive_type_;
-  llvm::IRBuilder<>* ir_builder_;
+  llvm::IRBuilder<>* b_;
   llvm::Type* vector_type_;
   llvm::Type* vector_pointer_type_;
   llvm::Type* scalar_type_;
@@ -289,22 +287,21 @@ class VectorSupportLibrary {
 // can later convert to a SSA value.
 class LlvmVariable {
  public:
-  LlvmVariable(llvm::Type*, llvm::IRBuilder<>* ir_builder);
+  LlvmVariable(llvm::Type*, llvm::IRBuilder<>* b);
 
   llvm::Value* Get() const;
   void Set(llvm::Value* new_value);
 
  private:
   llvm::AllocaInst* alloca_;
-  llvm::IRBuilder<>* ir_builder_;
+  llvm::IRBuilder<>* b_;
 };
 
 class VectorVariable : public LlvmVariable {
  public:
   VectorVariable(VectorSupportLibrary* vector_support,
                  llvm::Value* initial_value)
-      : LlvmVariable(vector_support->vector_type(),
-                     vector_support->ir_builder()) {
+      : LlvmVariable(vector_support->vector_type(), vector_support->b()) {
     Set(initial_value);
   }
 };
@@ -313,8 +310,7 @@ class ScalarVariable : public LlvmVariable {
  public:
   ScalarVariable(VectorSupportLibrary* vector_support,
                  llvm::Value* initial_value)
-      : LlvmVariable(vector_support->scalar_type(),
-                     vector_support->ir_builder()) {
+      : LlvmVariable(vector_support->scalar_type(), vector_support->b()) {
     Set(initial_value);
   }
 };
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 51f16bdc94777fe027991cb756322335eecda81f..097fa23027bf55ad0b92c347c5a1209bb5836695 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -212,6 +212,7 @@ class DfsHloVisitorBase {
   virtual Status HandleReverse(HloInstructionPtr hlo) = 0;
   virtual Status HandleSort(HloInstructionPtr hlo) = 0;
   virtual Status HandleConstant(HloInstructionPtr hlo) = 0;
+  virtual Status HandleIota(HloInstructionPtr hlo) = 0;
   virtual Status HandleGetTupleElement(HloInstructionPtr hlo) = 0;
   virtual Status HandleReduce(HloInstructionPtr hlo) = 0;
   virtual Status HandleBitcast(HloInstructionPtr hlo) = 0;
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 0686ca74afcde541e89ed3d16b79914de7d06bb3..f4316e0fb77855aad1c4710908df09c604da896e 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -115,6 +115,9 @@ class DfsHloVisitorWithDefaultBase
   Status HandleConstant(HloInstructionPtr constant) override {
     return DefaultAction(constant);
   }
+  Status HandleIota(HloInstructionPtr iota) override {
+    return DefaultAction(iota);
+  }
   Status HandleGetTupleElement(HloInstructionPtr get_tuple_element) override {
     return DefaultAction(get_tuple_element);
   }
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index c51632597ac038cc984bad9862631efa411a4d56..f883eb828c7f6365dfd4d5e0b514dc6894adc12b 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -61,13 +61,13 @@ int64 GlobalRandomValue() {
 
 llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits,
                                       int64 mantissa_bits,
-                                      llvm::IRBuilder<>* ir_builder) {
+                                      llvm::IRBuilder<>* b) {
   // Integer and float types for casting and constant generation.
   llvm::Type* float_type = x->getType();
-  llvm::IntegerType* int_type = ir_builder->getInt32Ty();
+  llvm::IntegerType* int_type = b->getInt32Ty();
 
   // Cast the input value to an integer for bitwise manipulation.
-  llvm::Value* x_as_int = ir_builder->CreateBitCast(x, int_type);
+  llvm::Value* x_as_int = b->CreateBitCast(x, int_type);
 
   if (mantissa_bits < 23) {
     // Last remaining mantissa bit.
@@ -77,22 +77,22 @@ llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits,
     // equal to a base value of 0111... plus one bit if the last remaining
     // mantissa bit is 1.
     const uint32_t base_rounding_bias = (last_mantissa_bit_mask >> 1) - 1;
-    llvm::Value* x_last_mantissa_bit = ir_builder->CreateLShr(
-        ir_builder->CreateAnd(
-            x_as_int, llvm::ConstantInt::get(int_type, last_mantissa_bit_mask)),
+    llvm::Value* x_last_mantissa_bit = b->CreateLShr(
+        b->CreateAnd(x_as_int,
+                     llvm::ConstantInt::get(int_type, last_mantissa_bit_mask)),
         (23 - mantissa_bits));
-    llvm::Value* x_rounding_bias = ir_builder->CreateAdd(
-        x_last_mantissa_bit,
-        llvm::ConstantInt::get(int_type, base_rounding_bias));
+    llvm::Value* x_rounding_bias =
+        b->CreateAdd(x_last_mantissa_bit,
+                     llvm::ConstantInt::get(int_type, base_rounding_bias));
 
     // Add rounding bias, and mask out truncated bits.  Note that the case
     // where adding the rounding bias overflows into the exponent bits is
     // correct; the non-masked mantissa bits will all be zero, and the
     // exponent will be incremented by one.
     const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1);
-    x_as_int = ir_builder->CreateAdd(x_as_int, x_rounding_bias);
-    x_as_int = ir_builder->CreateAnd(
-        x_as_int, llvm::ConstantInt::get(int_type, truncation_mask));
+    x_as_int = b->CreateAdd(x_as_int, x_rounding_bias);
+    x_as_int = b->CreateAnd(x_as_int,
+                            llvm::ConstantInt::get(int_type, truncation_mask));
   }
 
   if (exponent_bits < 8) {
@@ -120,29 +120,29 @@ llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits,
         f32_exponent_bias - reduced_exponent_bias;
 
     // Do we overflow or underflow?
-    llvm::Value* x_exponent = ir_builder->CreateAnd(
+    llvm::Value* x_exponent = b->CreateAnd(
         x_as_int, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
-    llvm::Value* x_overflows = ir_builder->CreateICmpUGT(
+    llvm::Value* x_overflows = b->CreateICmpUGT(
         x_exponent,
         llvm::ConstantInt::get(int_type, reduced_max_exponent << 23));
-    llvm::Value* x_underflows = ir_builder->CreateICmpULE(
+    llvm::Value* x_underflows = b->CreateICmpULE(
         x_exponent,
         llvm::ConstantInt::get(int_type, reduced_min_exponent << 23));
 
     // Compute appropriately-signed values of zero and infinity.
-    llvm::Value* x_signed_zero = ir_builder->CreateAnd(
+    llvm::Value* x_signed_zero = b->CreateAnd(
         x_as_int, llvm::ConstantInt::get(int_type, f32_sign_bit_mask));
-    llvm::Value* x_signed_inf = ir_builder->CreateOr(
+    llvm::Value* x_signed_inf = b->CreateOr(
         x_signed_zero, llvm::ConstantInt::get(int_type, f32_exp_bits_mask));
 
     // Force to zero or infinity if overflow or underflow.  (Note that this
     // truncates all denormal values to zero, rather than rounding them.)
-    x_as_int = ir_builder->CreateSelect(x_overflows, x_signed_inf, x_as_int);
-    x_as_int = ir_builder->CreateSelect(x_underflows, x_signed_zero, x_as_int);
+    x_as_int = b->CreateSelect(x_overflows, x_signed_inf, x_as_int);
+    x_as_int = b->CreateSelect(x_underflows, x_signed_zero, x_as_int);
   }
 
   // Cast the result back to a floating-point type.
-  llvm::Value* result = ir_builder->CreateBitCast(x_as_int, float_type);
+  llvm::Value* result = b->CreateBitCast(x_as_int, float_type);
 
   // Correct result for NaN inputs.
   //
@@ -154,53 +154,49 @@ llvm::Value* EmitReducePrecisionFloat(llvm::Value* x, int64 exponent_bits,
   //
   // If the fast-math flags are set to assume no NaNs, the comparison is likely
   // to be optimized away, so there's no point in even emitting it.
-  if (!ir_builder->getFastMathFlags().noNaNs()) {
-    llvm::Value* x_is_nan = ir_builder->CreateFCmpUNO(x, x);
+  if (!b->getFastMathFlags().noNaNs()) {
+    llvm::Value* x_is_nan = b->CreateFCmpUNO(x, x);
 
     if (mantissa_bits > 0) {
-      result = ir_builder->CreateSelect(x_is_nan, x, result);
+      result = b->CreateSelect(x_is_nan, x, result);
     } else {
-      result = ir_builder->CreateSelect(
+      result = b->CreateSelect(
           x_is_nan, llvm::ConstantFP::getInfinity(float_type), result);
     }
   }
   return result;
 }
 
-llvm::Value* EmitF32ToBF16(llvm::Value* f32_value,
-                           llvm::IRBuilder<>* ir_builder) {
+llvm::Value* EmitF32ToBF16(llvm::Value* f32_value, llvm::IRBuilder<>* b) {
   auto reduced_precision = EmitReducePrecisionFloat(
       f32_value,
       /*exponent_bits=*/primitive_util::kBFloat16ExponentBits,
-      /*mantissa_bits=*/primitive_util::kBFloat16MantissaBits, ir_builder);
-  auto as_int32 =
-      ir_builder->CreateBitCast(reduced_precision, ir_builder->getInt32Ty());
-  auto shifted = ir_builder->CreateLShr(as_int32, 16);
-  auto truncated = ir_builder->CreateTrunc(shifted, ir_builder->getInt16Ty());
-  return ir_builder->CreateBitCast(truncated, ir_builder->getInt16Ty());
+      /*mantissa_bits=*/primitive_util::kBFloat16MantissaBits, b);
+  auto as_int32 = b->CreateBitCast(reduced_precision, b->getInt32Ty());
+  auto shifted = b->CreateLShr(as_int32, 16);
+  auto truncated = b->CreateTrunc(shifted, b->getInt16Ty());
+  return b->CreateBitCast(truncated, b->getInt16Ty());
 }
 
-llvm::Value* EmitBF16ToF32(llvm::Value* bf16_value,
-                           llvm::IRBuilder<>* ir_builder) {
-  auto as_int16 =
-      ir_builder->CreateBitCast(bf16_value, ir_builder->getInt16Ty());
-  auto as_int32 = ir_builder->CreateZExt(as_int16, ir_builder->getInt32Ty());
-  auto shifted = ir_builder->CreateShl(as_int32, 16);
-  return ir_builder->CreateBitCast(shifted, ir_builder->getFloatTy());
+llvm::Value* EmitBF16ToF32(llvm::Value* bf16_value, llvm::IRBuilder<>* b) {
+  auto as_int16 = b->CreateBitCast(bf16_value, b->getInt16Ty());
+  auto as_int32 = b->CreateZExt(as_int16, b->getInt32Ty());
+  auto shifted = b->CreateShl(as_int32, 16);
+  return b->CreateBitCast(shifted, b->getFloatTy());
 }
 
 llvm::Value* EmitIntegralToFloating(llvm::Value* integer_value,
                                     PrimitiveType from_type,
                                     PrimitiveType to_type, llvm::Module* module,
-                                    llvm::IRBuilder<>* ir_builder) {
+                                    llvm::IRBuilder<>* b) {
   if (primitive_util::IsSignedIntegralType(from_type)) {
-    return ir_builder->CreateSIToFP(
-        integer_value, llvm_ir::PrimitiveTypeToIrType(to_type, module));
+    return b->CreateSIToFP(integer_value,
+                           llvm_ir::PrimitiveTypeToIrType(to_type, module));
   } else {
     CHECK(primitive_util::IsUnsignedIntegralType(from_type) ||
           from_type == PRED);
-    return ir_builder->CreateUIToFP(
-        integer_value, llvm_ir::PrimitiveTypeToIrType(to_type, module));
+    return b->CreateUIToFP(integer_value,
+                           llvm_ir::PrimitiveTypeToIrType(to_type, module));
   }
 }
 
@@ -226,39 +222,43 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
     case HloOpcode::kConvert: {
       PrimitiveType from_type = op->operand(0)->shape().element_type();
       PrimitiveType to_type = op->shape().element_type();
-      CHECK(primitive_util::IsIntegralType(from_type) || from_type == PRED);
+      CHECK(primitive_util::IsIntegralType(from_type) || from_type == PRED)
+          << from_type;
       if (from_type == to_type) {
         return operand_value;
       }
+      if (to_type == PRED) {
+        return b_->CreateZExt(
+            b_->CreateICmpNE(operand_value, llvm::ConstantInt::get(
+                                                operand_value->getType(), 0)),
+            llvm_ir::PrimitiveTypeToIrType(PRED, module_));
+      }
       if (primitive_util::IsIntegralType(to_type)) {
-        return ir_builder_->CreateIntCast(
+        return b_->CreateIntCast(
             operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_),
             primitive_util::IsSignedIntegralType(from_type));
       }
       if (primitive_util::IsFloatingPointType(to_type)) {
         if (to_type == BF16) {
-          return EmitF32ToBF16(
-              EmitIntegralToFloating(operand_value, from_type, F32, module_,
-                                     ir_builder_),
-              ir_builder_);
+          return EmitF32ToBF16(EmitIntegralToFloating(operand_value, from_type,
+                                                      F32, module_, b_),
+                               b_);
         }
         return EmitIntegralToFloating(operand_value, from_type, to_type,
-                                      module_, ir_builder_);
+                                      module_, b_);
       }
       if (primitive_util::IsComplexType(to_type)) {
         auto to_ir_component_type = llvm_ir::PrimitiveTypeToIrType(
             primitive_util::ComplexComponentType(to_type), module_);
         if (primitive_util::IsSignedIntegralType(from_type)) {
           return EmitComposeComplex(
-              op,
-              ir_builder_->CreateSIToFP(operand_value, to_ir_component_type),
+              op, b_->CreateSIToFP(operand_value, to_ir_component_type),
               nullptr);
         }
         if (primitive_util::IsUnsignedIntegralType(from_type) ||
             from_type == PRED) {
           return EmitComposeComplex(
-              op,
-              ir_builder_->CreateUIToFP(operand_value, to_ir_component_type),
+              op, b_->CreateUIToFP(operand_value, to_ir_component_type),
               nullptr);
         }
       }
@@ -275,7 +275,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
       }
       if (primitive_util::BitWidth(from_type) ==
           primitive_util::BitWidth(to_type)) {
-        return ir_builder_->CreateBitCast(
+        return b_->CreateBitCast(
             operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
       }
       return InvalidArgument(
@@ -293,18 +293,18 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
         auto type =
             llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(), module_);
         auto zero = llvm::ConstantInt::get(type, 0);
-        auto cmp = ir_builder_->CreateICmpSGE(operand_value, zero);
-        return ir_builder_->CreateSelect(cmp, operand_value,
-                                         ir_builder_->CreateNeg(operand_value));
+        auto cmp = b_->CreateICmpSGE(operand_value, zero);
+        return b_->CreateSelect(cmp, operand_value,
+                                b_->CreateNeg(operand_value));
       } else {
         return operand_value;
       }
     }
     case HloOpcode::kClz: {
-      auto is_zero_undef = ir_builder_->getFalse();
-      return llvm_ir::EmitCallToIntrinsic(
-          llvm::Intrinsic::ctlz, {operand_value, is_zero_undef},
-          {operand_value->getType()}, ir_builder_);
+      auto is_zero_undef = b_->getFalse();
+      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::ctlz,
+                                          {operand_value, is_zero_undef},
+                                          {operand_value->getType()}, b_);
     }
     case HloOpcode::kSign: {
       bool is_signed =
@@ -312,31 +312,28 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerUnaryOp(
       auto type =
           llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(), module_);
       auto zero = llvm::ConstantInt::get(type, 0);
-      auto cmp = ir_builder_->CreateICmpEQ(operand_value, zero);
+      auto cmp = b_->CreateICmpEQ(operand_value, zero);
       if (is_signed) {
-        auto ashr = ir_builder_->CreateAShr(operand_value,
-                                            type->getIntegerBitWidth() - 1);
-        return ir_builder_->CreateSelect(cmp, zero,
-                                         ir_builder_->CreateOr(ashr, 1));
+        auto ashr =
+            b_->CreateAShr(operand_value, type->getIntegerBitWidth() - 1);
+        return b_->CreateSelect(cmp, zero, b_->CreateOr(ashr, 1));
       } else {
-        return ir_builder_->CreateSelect(cmp, zero,
-                                         llvm::ConstantInt::get(type, 1));
+        return b_->CreateSelect(cmp, zero, llvm::ConstantInt::get(type, 1));
       }
     }
     case HloOpcode::kNegate:
-      return ir_builder_->CreateNeg(operand_value);
+      return b_->CreateNeg(operand_value);
     case HloOpcode::kNot: {
       auto type = op->shape().element_type();
       if (type == PRED) {
         // It is not sufficient to just call CreateNot() here because a PRED
         // is represented as an i8 and the truth value is stored only in the
         // bottom bit.
-        return ir_builder_->CreateZExt(
-            ir_builder_->CreateNot(ir_builder_->CreateTrunc(
-                operand_value, ir_builder_->getInt1Ty())),
+        return b_->CreateZExt(
+            b_->CreateNot(b_->CreateTrunc(operand_value, b_->getInt1Ty())),
             llvm_ir::PrimitiveTypeToIrType(PRED, module_));
       } else if (primitive_util::IsIntegralType(type)) {
-        return ir_builder_->CreateNot(operand_value);
+        return b_->CreateNot(operand_value);
       }
       return Unimplemented("unary op Not is not defined for type '%d'", type);
     }
@@ -352,7 +349,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
     case HloOpcode::kConvert: {
       PrimitiveType from_type = op->operand(0)->shape().element_type();
       PrimitiveType to_type = op->shape().element_type();
-      CHECK(primitive_util::IsFloatingPointType(from_type));
+      CHECK(primitive_util::IsFloatingPointType(from_type)) << from_type;
       if (from_type == to_type) {
         return operand_value;
       }
@@ -364,32 +361,38 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
         }
         return EmitComposeComplex(
             op,
-            ir_builder_->CreateFPCast(
-                operand_value,
-                llvm_ir::PrimitiveTypeToIrType(to_component_type, module_)),
+            b_->CreateFPCast(operand_value, llvm_ir::PrimitiveTypeToIrType(
+                                                to_component_type, module_)),
             nullptr);
       }
       if (from_type == BF16) {
         TF_RET_CHECK(to_type != BF16);
-        operand_value = EmitBF16ToF32(operand_value, ir_builder_);
+        operand_value = EmitBF16ToF32(operand_value, b_);
         from_type = F32;
         if (from_type == to_type) {
           return operand_value;
         }
       }
       if (from_type == F32 && to_type == BF16) {
-        return EmitF32ToBF16(operand_value, ir_builder_);
+        return EmitF32ToBF16(operand_value, b_);
+      }
+      if (to_type == PRED) {
+        return b_->CreateZExt(
+            b_->CreateFCmpUNE(
+                operand_value,
+                llvm::ConstantFP::get(operand_value->getType(), 0.0)),
+            llvm_ir::PrimitiveTypeToIrType(PRED, module_));
       }
       if (primitive_util::IsFloatingPointType(to_type)) {
-        return ir_builder_->CreateFPCast(
+        return b_->CreateFPCast(
             operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
       }
       if (primitive_util::IsSignedIntegralType(to_type)) {
-        return ir_builder_->CreateFPToSI(
+        return b_->CreateFPToSI(
             operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
       }
       if (primitive_util::IsUnsignedIntegralType(to_type)) {
-        return ir_builder_->CreateFPToUI(
+        return b_->CreateFPToUI(
             operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
       }
       return Unimplemented("unhandled conversion operation: %s => %s",
@@ -405,7 +408,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
       }
       if (primitive_util::BitWidth(from_type) ==
           primitive_util::BitWidth(to_type)) {
-        return ir_builder_->CreateBitCast(
+        return b_->CreateBitCast(
             operand_value, llvm_ir::PrimitiveTypeToIrType(to_type, module_));
       }
       return InvalidArgument(
@@ -429,45 +432,45 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
     case HloOpcode::kSin:
       return EmitSin(op->shape().element_type(), operand_value);
     case HloOpcode::kFloor:
-      return llvm_ir::EmitCallToIntrinsic(
-          llvm::Intrinsic::floor, {operand_value}, {operand_value->getType()},
-          ir_builder_);
+      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::floor,
+                                          {operand_value},
+                                          {operand_value->getType()}, b_);
     case HloOpcode::kCeil:
-      return llvm_ir::EmitCallToIntrinsic(
-          llvm::Intrinsic::ceil, {operand_value}, {operand_value->getType()},
-          ir_builder_);
+      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::ceil,
+                                          {operand_value},
+                                          {operand_value->getType()}, b_);
     case HloOpcode::kAbs:
-      return llvm_ir::EmitCallToIntrinsic(
-          llvm::Intrinsic::fabs, {operand_value}, {operand_value->getType()},
-          ir_builder_);
+      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs,
+                                          {operand_value},
+                                          {operand_value->getType()}, b_);
     case HloOpcode::kRoundNearestAfz:
-      return llvm_ir::EmitCallToIntrinsic(
-          llvm::Intrinsic::round, {operand_value}, {operand_value->getType()},
-          ir_builder_);
+      return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::round,
+                                          {operand_value},
+                                          {operand_value->getType()}, b_);
     case HloOpcode::kSign: {
       // TODO(b/32151903): Ensure consistent sign behavior for -0.0.
       auto type = operand_value->getType();
       auto zero = llvm::ConstantFP::get(type, 0.0);
-      auto oeq = ir_builder_->CreateFCmpOEQ(operand_value, zero);
-      auto olt = ir_builder_->CreateFCmpOLT(operand_value, zero);
-      return ir_builder_->CreateSelect(
+      auto oeq = b_->CreateFCmpOEQ(operand_value, zero);
+      auto olt = b_->CreateFCmpOLT(operand_value, zero);
+      return b_->CreateSelect(
           oeq, zero,
-          ir_builder_->CreateSelect(olt, llvm::ConstantFP::get(type, -1.0),
-                                    llvm::ConstantFP::get(type, 1.0)));
+          b_->CreateSelect(olt, llvm::ConstantFP::get(type, -1.0),
+                           llvm::ConstantFP::get(type, 1.0)));
     }
     case HloOpcode::kIsFinite: {
       // abs(x) o!= inf, this works because the comparison returns false if
       // either operand is NaN.
       auto type = operand_value->getType();
       auto abs_value = llvm_ir::EmitCallToIntrinsic(
-          llvm::Intrinsic::fabs, {operand_value}, {type}, ir_builder_);
+          llvm::Intrinsic::fabs, {operand_value}, {type}, b_);
       auto infinity = llvm::ConstantFP::getInfinity(type);
-      auto not_infinite = ir_builder_->CreateFCmpONE(abs_value, infinity);
-      return ir_builder_->CreateZExt(
-          not_infinite, llvm_ir::PrimitiveTypeToIrType(PRED, module_));
+      auto not_infinite = b_->CreateFCmpONE(abs_value, infinity);
+      return b_->CreateZExt(not_infinite,
+                            llvm_ir::PrimitiveTypeToIrType(PRED, module_));
     }
     case HloOpcode::kNegate:
-      return ir_builder_->CreateFNeg(operand_value);
+      return b_->CreateFNeg(operand_value);
     case HloOpcode::kReal:
       return operand_value;
     case HloOpcode::kImag:
@@ -491,13 +494,12 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       auto a = EmitExtractReal(operand_value);
       auto b = EmitExtractImag(operand_value);
       llvm::Type* llvm_ty = a->getType();
-      auto sum_sq = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(a, a),
-                                            ir_builder_->CreateFMul(b, b));
+      auto sum_sq = b_->CreateFAdd(b_->CreateFMul(a, a), b_->CreateFMul(b, b));
       TF_ASSIGN_OR_RETURN(auto log_sum_sq, EmitLog(component_type, sum_sq));
       TF_ASSIGN_OR_RETURN(auto angle, EmitAtan2(component_type, b, a));
       auto one_half = llvm::ConstantFP::get(llvm_ty, 0.5);
-      return EmitComposeComplex(
-          op, ir_builder_->CreateFMul(one_half, log_sum_sq), angle);
+      return EmitComposeComplex(op, b_->CreateFMul(one_half, log_sum_sq),
+                                angle);
     }
     case HloOpcode::kLog1p: {
       // log1p(a+bi) = .5*log((a+1)^2+b^2) + i*atan2(b, a + 1)
@@ -505,15 +507,14 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       auto b = EmitExtractImag(operand_value);
       llvm::Type* llvm_ty = a->getType();
       auto one = llvm::ConstantFP::get(llvm_ty, 1.0);
-      auto a_plus_one = ir_builder_->CreateFAdd(a, one);
-      auto sum_sq = ir_builder_->CreateFAdd(
-          ir_builder_->CreateFMul(a_plus_one, a_plus_one),
-          ir_builder_->CreateFMul(b, b));
+      auto a_plus_one = b_->CreateFAdd(a, one);
+      auto sum_sq = b_->CreateFAdd(b_->CreateFMul(a_plus_one, a_plus_one),
+                                   b_->CreateFMul(b, b));
       TF_ASSIGN_OR_RETURN(auto log_sum_sq, EmitLog(component_type, sum_sq));
       TF_ASSIGN_OR_RETURN(auto angle, EmitAtan2(component_type, b, a_plus_one));
       auto one_half = llvm::ConstantFP::get(llvm_ty, 0.5);
-      return EmitComposeComplex(
-          op, ir_builder_->CreateFMul(one_half, log_sum_sq), angle);
+      return EmitComposeComplex(op, b_->CreateFMul(one_half, log_sum_sq),
+                                angle);
     }
     case HloOpcode::kConvert: {
       PrimitiveType from_type = op->operand(0)->shape().element_type();
@@ -527,12 +528,11 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
           primitive_util::ComplexComponentType(to_type);
       auto to_ir_component_type =
           llvm_ir::PrimitiveTypeToIrType(to_component_type, module_);
-      return EmitComposeComplex(
-          op,
-          ir_builder_->CreateFPCast(EmitExtractReal(operand_value),
-                                    to_ir_component_type),
-          ir_builder_->CreateFPCast(EmitExtractImag(operand_value),
-                                    to_ir_component_type));
+      return EmitComposeComplex(op,
+                                b_->CreateFPCast(EmitExtractReal(operand_value),
+                                                 to_ir_component_type),
+                                b_->CreateFPCast(EmitExtractImag(operand_value),
+                                                 to_ir_component_type));
     }
     case HloOpcode::kExp: {
       // e^(a+bi) = e^a*(cos(b)+sin(b)i)
@@ -542,8 +542,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
           auto cos_b, EmitCos(component_type, EmitExtractImag(operand_value)));
       TF_ASSIGN_OR_RETURN(
           auto sin_b, EmitSin(component_type, EmitExtractImag(operand_value)));
-      return EmitComposeComplex(op, ir_builder_->CreateFMul(exp_a, cos_b),
-                                ir_builder_->CreateFMul(exp_a, sin_b));
+      return EmitComposeComplex(op, b_->CreateFMul(exp_a, cos_b),
+                                b_->CreateFMul(exp_a, sin_b));
     }
     case HloOpcode::kExpm1: {
       // e^(a+bi)-1 = (e^a*cos(b)-1)+e^a*sin(b)i
@@ -554,9 +554,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       TF_ASSIGN_OR_RETURN(
           auto sin_b, EmitSin(component_type, EmitExtractImag(operand_value)));
       auto one = llvm::ConstantFP::get(exp_a->getType(), 1.0);
-      auto real_result =
-          ir_builder_->CreateFSub(ir_builder_->CreateFMul(exp_a, cos_b), one);
-      auto imag_result = ir_builder_->CreateFMul(exp_a, sin_b);
+      auto real_result = b_->CreateFSub(b_->CreateFMul(exp_a, cos_b), one);
+      auto imag_result = b_->CreateFMul(exp_a, sin_b);
       return EmitComposeComplex(op, real_result, imag_result);
     }
     case HloOpcode::kCos: {
@@ -571,18 +570,14 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       auto b = EmitExtractImag(operand_value);
       auto type = a->getType();
       TF_ASSIGN_OR_RETURN(auto exp_b, EmitExp(component_type, b));
-      auto half_exp_b =
-          ir_builder_->CreateFMul(llvm::ConstantFP::get(type, 0.5), exp_b);
+      auto half_exp_b = b_->CreateFMul(llvm::ConstantFP::get(type, 0.5), exp_b);
       auto half_exp_neg_b =
-          ir_builder_->CreateFDiv(llvm::ConstantFP::get(type, 0.5), exp_b);
+          b_->CreateFDiv(llvm::ConstantFP::get(type, 0.5), exp_b);
       TF_ASSIGN_OR_RETURN(auto cos_a, EmitCos(component_type, a));
       TF_ASSIGN_OR_RETURN(auto sin_a, EmitSin(component_type, a));
       return EmitComposeComplex(
-          op,
-          ir_builder_->CreateFMul(
-              cos_a, ir_builder_->CreateFAdd(half_exp_neg_b, half_exp_b)),
-          ir_builder_->CreateFMul(
-              sin_a, ir_builder_->CreateFSub(half_exp_neg_b, half_exp_b)));
+          op, b_->CreateFMul(cos_a, b_->CreateFAdd(half_exp_neg_b, half_exp_b)),
+          b_->CreateFMul(sin_a, b_->CreateFSub(half_exp_neg_b, half_exp_b)));
     }
     case HloOpcode::kSin: {
       // sin(z) = .5i(e^(-iz) - e^(iz))
@@ -598,18 +593,14 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       auto b = EmitExtractImag(operand_value);
       auto type = a->getType();
       TF_ASSIGN_OR_RETURN(auto exp_b, EmitExp(component_type, b));
-      auto half_exp_b =
-          ir_builder_->CreateFMul(llvm::ConstantFP::get(type, 0.5), exp_b);
+      auto half_exp_b = b_->CreateFMul(llvm::ConstantFP::get(type, 0.5), exp_b);
       auto half_exp_neg_b =
-          ir_builder_->CreateFDiv(llvm::ConstantFP::get(type, 0.5), exp_b);
+          b_->CreateFDiv(llvm::ConstantFP::get(type, 0.5), exp_b);
       TF_ASSIGN_OR_RETURN(auto cos_a, EmitCos(component_type, a));
       TF_ASSIGN_OR_RETURN(auto sin_a, EmitSin(component_type, a));
       return EmitComposeComplex(
-          op,
-          ir_builder_->CreateFMul(
-              sin_a, ir_builder_->CreateFAdd(half_exp_b, half_exp_neg_b)),
-          ir_builder_->CreateFMul(
-              cos_a, ir_builder_->CreateFSub(half_exp_b, half_exp_neg_b)));
+          op, b_->CreateFMul(sin_a, b_->CreateFAdd(half_exp_b, half_exp_neg_b)),
+          b_->CreateFMul(cos_a, b_->CreateFSub(half_exp_b, half_exp_neg_b)));
     }
     case HloOpcode::kTanh: {
       /*
@@ -637,64 +628,61 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexUnaryOp(
       TF_ASSIGN_OR_RETURN(auto exp_a, EmitExp(component_type, a));
       TF_ASSIGN_OR_RETURN(auto cos_b, EmitCos(component_type, b));
       TF_ASSIGN_OR_RETURN(auto sin_b, EmitSin(component_type, b));
-      auto exp_neg_a = ir_builder_->CreateFDiv(
-          llvm::ConstantFP::get(exp_a->getType(), 1), exp_a);
-      auto exp_2a_minus_exp_neg_2a = ir_builder_->CreateFSub(
-          ir_builder_->CreateFMul(exp_a, exp_a),
-          ir_builder_->CreateFMul(exp_neg_a, exp_neg_a));
-      auto cos_b_sq = ir_builder_->CreateFMul(cos_b, cos_b);
-      auto sin_b_sq = ir_builder_->CreateFMul(sin_b, sin_b);
-      auto real_num = ir_builder_->CreateFAdd(
-          ir_builder_->CreateFMul(cos_b_sq, exp_2a_minus_exp_neg_2a),
-          ir_builder_->CreateFMul(sin_b_sq, exp_2a_minus_exp_neg_2a));
-      auto cos_b_sin_b = ir_builder_->CreateFMul(cos_b, sin_b);
-      auto exp_a_plus_exp_neg_a = ir_builder_->CreateFAdd(exp_a, exp_neg_a);
+      auto exp_neg_a =
+          b_->CreateFDiv(llvm::ConstantFP::get(exp_a->getType(), 1), exp_a);
+      auto exp_2a_minus_exp_neg_2a = b_->CreateFSub(
+          b_->CreateFMul(exp_a, exp_a), b_->CreateFMul(exp_neg_a, exp_neg_a));
+      auto cos_b_sq = b_->CreateFMul(cos_b, cos_b);
+      auto sin_b_sq = b_->CreateFMul(sin_b, sin_b);
+      auto real_num =
+          b_->CreateFAdd(b_->CreateFMul(cos_b_sq, exp_2a_minus_exp_neg_2a),
+                         b_->CreateFMul(sin_b_sq, exp_2a_minus_exp_neg_2a));
+      auto cos_b_sin_b = b_->CreateFMul(cos_b, sin_b);
+      auto exp_a_plus_exp_neg_a = b_->CreateFAdd(exp_a, exp_neg_a);
       auto exp_a_plus_exp_neg_a_sq =
-          ir_builder_->CreateFMul(exp_a_plus_exp_neg_a, exp_a_plus_exp_neg_a);
-      auto exp_a_minus_exp_neg_a = ir_builder_->CreateFSub(exp_a, exp_neg_a);
+          b_->CreateFMul(exp_a_plus_exp_neg_a, exp_a_plus_exp_neg_a);
+      auto exp_a_minus_exp_neg_a = b_->CreateFSub(exp_a, exp_neg_a);
       auto exp_a_minus_exp_neg_a_sq =
-          ir_builder_->CreateFMul(exp_a_minus_exp_neg_a, exp_a_minus_exp_neg_a);
-      auto imag_num = ir_builder_->CreateFMul(
-          cos_b_sin_b, ir_builder_->CreateFSub(exp_a_plus_exp_neg_a_sq,
-                                               exp_a_minus_exp_neg_a_sq));
-      auto denom = ir_builder_->CreateFAdd(
-          ir_builder_->CreateFMul(cos_b_sq, exp_a_plus_exp_neg_a_sq),
-          ir_builder_->CreateFMul(sin_b_sq, exp_a_minus_exp_neg_a_sq));
-      return EmitComposeComplex(op, ir_builder_->CreateFDiv(real_num, denom),
-                                ir_builder_->CreateFDiv(imag_num, denom));
+          b_->CreateFMul(exp_a_minus_exp_neg_a, exp_a_minus_exp_neg_a);
+      auto imag_num = b_->CreateFMul(
+          cos_b_sin_b,
+          b_->CreateFSub(exp_a_plus_exp_neg_a_sq, exp_a_minus_exp_neg_a_sq));
+      auto denom =
+          b_->CreateFAdd(b_->CreateFMul(cos_b_sq, exp_a_plus_exp_neg_a_sq),
+                         b_->CreateFMul(sin_b_sq, exp_a_minus_exp_neg_a_sq));
+      return EmitComposeComplex(op, b_->CreateFDiv(real_num, denom),
+                                b_->CreateFDiv(imag_num, denom));
     }
     case HloOpcode::kAbs: {
-      auto sum_sq = ir_builder_->CreateFAdd(
-          ir_builder_->CreateFMul(EmitExtractReal(operand_value),
-                                  EmitExtractReal(operand_value)),
-          ir_builder_->CreateFMul(EmitExtractImag(operand_value),
-                                  EmitExtractImag(operand_value)));
+      auto sum_sq =
+          b_->CreateFAdd(b_->CreateFMul(EmitExtractReal(operand_value),
+                                        EmitExtractReal(operand_value)),
+                         b_->CreateFMul(EmitExtractImag(operand_value),
+                                        EmitExtractImag(operand_value)));
       return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sqrt, {sum_sq},
-                                          {sum_sq->getType()}, ir_builder_);
+                                          {sum_sq->getType()}, b_);
     }
     case HloOpcode::kSign: {  // Sign(c) = c / |c|
-      auto sum_sq = ir_builder_->CreateFAdd(
-          ir_builder_->CreateFMul(EmitExtractReal(operand_value),
-                                  EmitExtractReal(operand_value)),
-          ir_builder_->CreateFMul(EmitExtractImag(operand_value),
-                                  EmitExtractImag(operand_value)));
+      auto sum_sq =
+          b_->CreateFAdd(b_->CreateFMul(EmitExtractReal(operand_value),
+                                        EmitExtractReal(operand_value)),
+                         b_->CreateFMul(EmitExtractImag(operand_value),
+                                        EmitExtractImag(operand_value)));
       auto cplx_abs = llvm_ir::EmitCallToIntrinsic(
-          llvm::Intrinsic::sqrt, {sum_sq}, {sum_sq->getType()}, ir_builder_);
+          llvm::Intrinsic::sqrt, {sum_sq}, {sum_sq->getType()}, b_);
       auto type = cplx_abs->getType();
       auto zero = llvm::ConstantFP::get(type, 0.0);
-      auto oeq = ir_builder_->CreateFCmpOEQ(cplx_abs, zero);
-      return ir_builder_->CreateSelect(
+      auto oeq = b_->CreateFCmpOEQ(cplx_abs, zero);
+      return b_->CreateSelect(
           oeq, EmitComposeComplex(op, zero, zero),
           EmitComposeComplex(
-              op,
-              ir_builder_->CreateFDiv(EmitExtractReal(operand_value), cplx_abs),
-              ir_builder_->CreateFDiv(EmitExtractImag(operand_value),
-                                      cplx_abs)));
+              op, b_->CreateFDiv(EmitExtractReal(operand_value), cplx_abs),
+              b_->CreateFDiv(EmitExtractImag(operand_value), cplx_abs)));
     }
     case HloOpcode::kNegate:
-      return EmitComposeComplex(
-          op, ir_builder_->CreateFNeg(EmitExtractReal(operand_value)),
-          ir_builder_->CreateFNeg(EmitExtractImag(operand_value)));
+      return EmitComposeComplex(op,
+                                b_->CreateFNeg(EmitExtractReal(operand_value)),
+                                b_->CreateFNeg(EmitExtractImag(operand_value)));
     case HloOpcode::kReal:
       return EmitExtractReal(operand_value);
     case HloOpcode::kImag:
@@ -728,15 +716,15 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatBinaryOp(
     case HloOpcode::kComplex:
       return EmitComposeComplex(op, lhs_value, rhs_value);
     case HloOpcode::kAdd:
-      return ir_builder_->CreateFAdd(lhs_value, rhs_value);
+      return b_->CreateFAdd(lhs_value, rhs_value);
     case HloOpcode::kSubtract:
-      return ir_builder_->CreateFSub(lhs_value, rhs_value);
+      return b_->CreateFSub(lhs_value, rhs_value);
     case HloOpcode::kMultiply:
-      return ir_builder_->CreateFMul(lhs_value, rhs_value);
+      return b_->CreateFMul(lhs_value, rhs_value);
     case HloOpcode::kDivide:
-      return ir_builder_->CreateFDiv(lhs_value, rhs_value);
+      return b_->CreateFDiv(lhs_value, rhs_value);
     case HloOpcode::kRemainder:
-      return ir_builder_->CreateFRem(lhs_value, rhs_value);
+      return b_->CreateFRem(lhs_value, rhs_value);
     // LLVM comparisons can be "unordered" (U) or "ordered" (O) -- ordered
     // comparisons always return false when one of the operands is NaN, whereas
     // unordered comparisons return true.
@@ -746,22 +734,22 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatBinaryOp(
     // matches C++'s semantics.
     case HloOpcode::kEq:
       return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ, lhs_value,
-                                     rhs_value, ir_builder_);
+                                     rhs_value, b_);
     case HloOpcode::kNe:
       return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE, lhs_value,
-                                     rhs_value, ir_builder_);
+                                     rhs_value, b_);
     case HloOpcode::kLt:
       return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OLT, lhs_value,
-                                     rhs_value, ir_builder_);
+                                     rhs_value, b_);
     case HloOpcode::kGt:
       return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OGT, lhs_value,
-                                     rhs_value, ir_builder_);
+                                     rhs_value, b_);
     case HloOpcode::kLe:
       return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OLE, lhs_value,
-                                     rhs_value, ir_builder_);
+                                     rhs_value, b_);
     case HloOpcode::kGe:
       return llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OGE, lhs_value,
-                                     rhs_value, ir_builder_);
+                                     rhs_value, b_);
 
     case HloOpcode::kMaximum:
       return EmitFloatMax(lhs_value, rhs_value);
@@ -782,64 +770,56 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
     llvm::Value* rhs_value) const {
   switch (op->opcode()) {
     case HloOpcode::kAdd:
-      return EmitComposeComplex(
-          op,
-          ir_builder_->CreateFAdd(EmitExtractReal(lhs_value),
-                                  EmitExtractReal(rhs_value)),
-          ir_builder_->CreateFAdd(EmitExtractImag(lhs_value),
-                                  EmitExtractImag(rhs_value)));
+      return EmitComposeComplex(op,
+                                b_->CreateFAdd(EmitExtractReal(lhs_value),
+                                               EmitExtractReal(rhs_value)),
+                                b_->CreateFAdd(EmitExtractImag(lhs_value),
+                                               EmitExtractImag(rhs_value)));
     case HloOpcode::kSubtract:
-      return EmitComposeComplex(
-          op,
-          ir_builder_->CreateFSub(EmitExtractReal(lhs_value),
-                                  EmitExtractReal(rhs_value)),
-          ir_builder_->CreateFSub(EmitExtractImag(lhs_value),
-                                  EmitExtractImag(rhs_value)));
+      return EmitComposeComplex(op,
+                                b_->CreateFSub(EmitExtractReal(lhs_value),
+                                               EmitExtractReal(rhs_value)),
+                                b_->CreateFSub(EmitExtractImag(lhs_value),
+                                               EmitExtractImag(rhs_value)));
     case HloOpcode::kMultiply:
       return EmitComposeComplex(
           op,
-          ir_builder_->CreateFSub(
-              ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
-                                      EmitExtractReal(rhs_value)),
-              ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
-                                      EmitExtractImag(rhs_value))),
-          ir_builder_->CreateFAdd(
-              ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
-                                      EmitExtractImag(rhs_value)),
-              ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
-                                      EmitExtractReal(rhs_value))));
+          b_->CreateFSub(b_->CreateFMul(EmitExtractReal(lhs_value),
+                                        EmitExtractReal(rhs_value)),
+                         b_->CreateFMul(EmitExtractImag(lhs_value),
+                                        EmitExtractImag(rhs_value))),
+          b_->CreateFAdd(b_->CreateFMul(EmitExtractReal(lhs_value),
+                                        EmitExtractImag(rhs_value)),
+                         b_->CreateFMul(EmitExtractImag(lhs_value),
+                                        EmitExtractReal(rhs_value))));
     case HloOpcode::kDivide: {
       // (a+bi) / (c+di) = ((a+bi)(c-di)) / ((c+di)(c-di))
       // = ((ac + bd) + (bc - ad)i) / (c^2 + d^2)
-      auto rhs_sum_sq = ir_builder_->CreateFAdd(
-          ir_builder_->CreateFMul(EmitExtractReal(rhs_value),
-                                  EmitExtractReal(rhs_value)),
-          ir_builder_->CreateFMul(EmitExtractImag(rhs_value),
-                                  EmitExtractImag(rhs_value)));
+      auto rhs_sum_sq =
+          b_->CreateFAdd(b_->CreateFMul(EmitExtractReal(rhs_value),
+                                        EmitExtractReal(rhs_value)),
+                         b_->CreateFMul(EmitExtractImag(rhs_value),
+                                        EmitExtractImag(rhs_value)));
       auto type = rhs_sum_sq->getType();
       auto zero = llvm::ConstantFP::get(type, 0.0);
-      auto oeq = ir_builder_->CreateFCmpOEQ(rhs_sum_sq, zero);
-      auto real_inf_or_nan =
-          ir_builder_->CreateFDiv(EmitExtractReal(lhs_value), zero);
-      auto imag_inf_or_nan =
-          ir_builder_->CreateFDiv(EmitExtractImag(lhs_value), zero);
-      return ir_builder_->CreateSelect(
+      auto oeq = b_->CreateFCmpOEQ(rhs_sum_sq, zero);
+      auto real_inf_or_nan = b_->CreateFDiv(EmitExtractReal(lhs_value), zero);
+      auto imag_inf_or_nan = b_->CreateFDiv(EmitExtractImag(lhs_value), zero);
+      return b_->CreateSelect(
           oeq, EmitComposeComplex(op, real_inf_or_nan, imag_inf_or_nan),
           EmitComposeComplex(
               op,
-              ir_builder_->CreateFDiv(
-                  ir_builder_->CreateFAdd(
-                      ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
-                                              EmitExtractReal(rhs_value)),
-                      ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
-                                              EmitExtractImag(rhs_value))),
+              b_->CreateFDiv(
+                  b_->CreateFAdd(b_->CreateFMul(EmitExtractReal(lhs_value),
+                                                EmitExtractReal(rhs_value)),
+                                 b_->CreateFMul(EmitExtractImag(lhs_value),
+                                                EmitExtractImag(rhs_value))),
                   rhs_sum_sq),
-              ir_builder_->CreateFDiv(
-                  ir_builder_->CreateFSub(
-                      ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
-                                              EmitExtractReal(rhs_value)),
-                      ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
-                                              EmitExtractImag(rhs_value))),
+              b_->CreateFDiv(
+                  b_->CreateFSub(b_->CreateFMul(EmitExtractImag(lhs_value),
+                                                EmitExtractReal(rhs_value)),
+                                 b_->CreateFMul(EmitExtractReal(lhs_value),
+                                                EmitExtractImag(rhs_value))),
                   rhs_sum_sq)));
     }
     // LLVM comparisons can be "unordered" (U) or "ordered" (O) -- ordered
@@ -850,21 +830,21 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
     // unordered comparison.  This makes x != y equivalent to !(x == y), and
     // matches C++'s semantics.
     case HloOpcode::kEq:
-      return ir_builder_->CreateAnd(
+      return b_->CreateAnd(
           llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ,
                                   EmitExtractReal(lhs_value),
-                                  EmitExtractReal(rhs_value), ir_builder_),
+                                  EmitExtractReal(rhs_value), b_),
           llvm_ir::EmitComparison(llvm::CmpInst::FCMP_OEQ,
                                   EmitExtractImag(lhs_value),
-                                  EmitExtractImag(rhs_value), ir_builder_));
+                                  EmitExtractImag(rhs_value), b_));
     case HloOpcode::kNe:
-      return ir_builder_->CreateOr(
+      return b_->CreateOr(
           llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE,
                                   EmitExtractReal(lhs_value),
-                                  EmitExtractReal(rhs_value), ir_builder_),
+                                  EmitExtractReal(rhs_value), b_),
           llvm_ir::EmitComparison(llvm::CmpInst::FCMP_UNE,
                                   EmitExtractImag(lhs_value),
-                                  EmitExtractImag(rhs_value), ir_builder_));
+                                  EmitExtractImag(rhs_value), b_));
 
     case HloOpcode::kPower: {
       // (a+bi)^(c+di) =
@@ -876,29 +856,26 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
       auto b = EmitExtractImag(lhs_value);
       auto c = EmitExtractReal(rhs_value);
       auto d = EmitExtractImag(rhs_value);
-      auto aa_p_bb = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(a, a),
-                                             ir_builder_->CreateFMul(b, b));
+      auto aa_p_bb = b_->CreateFAdd(b_->CreateFMul(a, a), b_->CreateFMul(b, b));
       auto one_half = llvm::ConstantFP::get(a->getType(), 0.5);
-      auto half_c = ir_builder_->CreateFMul(one_half, c);
+      auto half_c = b_->CreateFMul(one_half, c);
 
       TF_ASSIGN_OR_RETURN(auto aa_p_bb_to_half_c,
                           EmitPow(component_type, aa_p_bb, half_c));
-      auto neg_d = ir_builder_->CreateFNeg(d);
+      auto neg_d = b_->CreateFNeg(d);
       TF_ASSIGN_OR_RETURN(auto arg_lhs, EmitAtan2(component_type, b, a));
-      auto neg_d_arg_lhs = ir_builder_->CreateFMul(neg_d, arg_lhs);
+      auto neg_d_arg_lhs = b_->CreateFMul(neg_d, arg_lhs);
       TF_ASSIGN_OR_RETURN(auto e_to_neg_d_arg_lhs,
                           EmitExp(component_type, neg_d_arg_lhs));
-      auto coeff =
-          ir_builder_->CreateFMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs);
+      auto coeff = b_->CreateFMul(aa_p_bb_to_half_c, e_to_neg_d_arg_lhs);
       TF_ASSIGN_OR_RETURN(auto ln_aa_p_bb, EmitLog(component_type, aa_p_bb));
-      auto half_d = ir_builder_->CreateFMul(one_half, d);
-      auto q =
-          ir_builder_->CreateFAdd(ir_builder_->CreateFMul(c, arg_lhs),
-                                  ir_builder_->CreateFMul(half_d, ln_aa_p_bb));
+      auto half_d = b_->CreateFMul(one_half, d);
+      auto q = b_->CreateFAdd(b_->CreateFMul(c, arg_lhs),
+                              b_->CreateFMul(half_d, ln_aa_p_bb));
       TF_ASSIGN_OR_RETURN(auto cos_q, EmitCos(component_type, q));
       TF_ASSIGN_OR_RETURN(auto sin_q, EmitSin(component_type, q));
-      return EmitComposeComplex(op, ir_builder_->CreateFMul(coeff, cos_q),
-                                ir_builder_->CreateFMul(coeff, sin_q));
+      return EmitComposeComplex(op, b_->CreateFMul(coeff, cos_q),
+                                b_->CreateFMul(coeff, sin_q));
     }
     default:
       return Unimplemented("binary complex op '%s'",
@@ -908,12 +885,12 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitComplexBinaryOp(
 
 llvm::Value* ElementalIrEmitter::EmitFloatMax(llvm::Value* lhs_value,
                                               llvm::Value* rhs_value) const {
-  return llvm_ir::EmitFloatMax(lhs_value, rhs_value, ir_builder_);
+  return llvm_ir::EmitFloatMax(lhs_value, rhs_value, b_);
 }
 
 llvm::Value* ElementalIrEmitter::EmitFloatMin(llvm::Value* lhs_value,
                                               llvm::Value* rhs_value) const {
-  return llvm_ir::EmitFloatMin(lhs_value, rhs_value, ir_builder_);
+  return llvm_ir::EmitFloatMin(lhs_value, rhs_value, b_);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type,
@@ -925,15 +902,14 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type,
         "type F32.");
   }
   auto getFloat = [&](const float f) {
-    return llvm::ConstantFP::get(ir_builder_->getFloatTy(), f);
+    return llvm::ConstantFP::get(b_->getFloatTy(), f);
   };
   auto multiply_add = [&](tensorflow::gtl::ArraySlice<float> coefficients,
                           llvm::Value* w) {
     llvm::Value* p = getFloat(coefficients.front());
     coefficients.pop_front();
     for (float coefficient : coefficients) {
-      p = ir_builder_->CreateFAdd(ir_builder_->CreateFMul(p, w),
-                                  getFloat(coefficient));
+      p = b_->CreateFAdd(b_->CreateFMul(p, w), getFloat(coefficient));
     }
     return p;
   };
@@ -951,50 +927,48 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfInv(PrimitiveType prim_type,
   //   }
   //   return p*x
   llvm::Function* logf_fn = llvm::Intrinsic::getDeclaration(
-      module_, llvm::Intrinsic::log, {ir_builder_->getFloatTy()});
+      module_, llvm::Intrinsic::log, {b_->getFloatTy()});
 
-  llvm::Value* w = ir_builder_->CreateFNeg(ir_builder_->CreateCall(
-      logf_fn,
-      {ir_builder_->CreateFMul(ir_builder_->CreateFSub(getFloat(1.0f), x),
-                               ir_builder_->CreateFAdd(getFloat(1.0f), x))}));
+  llvm::Value* w = b_->CreateFNeg(b_->CreateCall(
+      logf_fn, {b_->CreateFMul(b_->CreateFSub(getFloat(1.0f), x),
+                               b_->CreateFAdd(getFloat(1.0f), x))}));
 
-  llvm::Value* p_addr = llvm_ir::EmitAllocaAtFunctionEntry(
-      ir_builder_->getFloatTy(), "p.addr", ir_builder_);
+  llvm::Value* p_addr =
+      llvm_ir::EmitAllocaAtFunctionEntry(b_->getFloatTy(), "p.addr", b_);
 
-  llvm_ir::LlvmIfData if_data =
-      llvm_ir::EmitIfThenElse(ir_builder_->CreateFCmpOLT(w, getFloat(5.0f)),
-                              "w_less_than_five", ir_builder_);
+  llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
+      b_->CreateFCmpOLT(w, getFloat(5.0f)), "w_less_than_five", b_);
   // Handle true BB.
-  SetToFirstInsertPoint(if_data.true_block, ir_builder_);
+  SetToFirstInsertPoint(if_data.true_block, b_);
   {
-    llvm::Value* lw = ir_builder_->CreateFSub(w, getFloat(2.5f));
+    llvm::Value* lw = b_->CreateFSub(w, getFloat(2.5f));
     tensorflow::gtl::ArraySlice<float> lq{
         2.81022636e-08f,  3.43273939e-07f, -3.5233877e-06f,
         -4.39150654e-06f, 0.00021858087f,  -0.00125372503f,
         -0.00417768164f,  0.246640727f,    1.50140941f};
     llvm::Value* p = multiply_add(lq, lw);
-    ir_builder_->CreateStore(p, p_addr);
+    b_->CreateStore(p, p_addr);
   }
 
   // Handle false BB.
-  SetToFirstInsertPoint(if_data.false_block, ir_builder_);
+  SetToFirstInsertPoint(if_data.false_block, b_);
   {
     llvm::Function* sqrtf_fn = llvm::Intrinsic::getDeclaration(
-        module_, llvm::Intrinsic::sqrt, {ir_builder_->getFloatTy()});
+        module_, llvm::Intrinsic::sqrt, {b_->getFloatTy()});
 
-    llvm::Value* gw = ir_builder_->CreateFSub(
-        ir_builder_->CreateCall(sqrtf_fn, {w}), getFloat(3.0f));
+    llvm::Value* gw =
+        b_->CreateFSub(b_->CreateCall(sqrtf_fn, {w}), getFloat(3.0f));
     tensorflow::gtl::ArraySlice<float> gq{
         -0.000200214257f, 0.000100950558f, 0.00134934322f,
         -0.00367342844f,  0.00573950773f,  -0.0076224613f,
         0.00943887047f,   1.00167406f,     2.83297682f};
     llvm::Value* p = multiply_add(gq, gw);
-    ir_builder_->CreateStore(p, p_addr);
+    b_->CreateStore(p, p_addr);
   }
 
-  SetToFirstInsertPoint(if_data.after_block, ir_builder_);
-  llvm::Value* p = ir_builder_->CreateLoad(p_addr);
-  return ir_builder_->CreateFMul(p, x);
+  SetToFirstInsertPoint(if_data.after_block, b_);
+  llvm::Value* p = b_->CreateLoad(p_addr);
+  return b_->CreateFMul(p, x);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfcInv(
@@ -1002,13 +976,13 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitErfcInv(
   // Compute erfcinv(value) by calculating erfinv(1.0 - value).
   auto type = llvm_ir::PrimitiveTypeToIrType(prim_type, module_);
   auto one = llvm::ConstantFP::get(type, 1.0);
-  return EmitErfInv(prim_type, ir_builder_->CreateFSub(one, value));
+  return EmitErfInv(prim_type, b_->CreateFSub(one, value));
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitLog(PrimitiveType prim_type,
                                                    llvm::Value* value) const {
   return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::log, {value},
-                                      {value->getType()}, ir_builder_);
+                                      {value->getType()}, b_);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitLog1p(PrimitiveType prim_type,
@@ -1020,35 +994,34 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitLog1p(PrimitiveType prim_type,
   // When x is large, the naive evaluation of ln(x + 1) is more
   // accurate than the Taylor series.
   TF_ASSIGN_OR_RETURN(auto for_large_x,
-                      EmitLog(prim_type, ir_builder_->CreateFAdd(x, one)));
+                      EmitLog(prim_type, b_->CreateFAdd(x, one)));
   // The Taylor series for ln(x+1) is x - x^2/2 - x^3/3 + ….
-  auto for_small_x = ir_builder_->CreateFMul(
-      ir_builder_->CreateFAdd(ir_builder_->CreateFMul(negative_half, x), one),
-      x);
+  auto for_small_x =
+      b_->CreateFMul(b_->CreateFAdd(b_->CreateFMul(negative_half, x), one), x);
   const auto kAntilogarithmIsSmallThreshold = 1e-4;
-  auto abs_x = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value},
-                                            {type}, ir_builder_);
-  auto x_is_small = ir_builder_->CreateFCmpOLT(
+  auto abs_x =
+      llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, {type}, b_);
+  auto x_is_small = b_->CreateFCmpOLT(
       abs_x, llvm::ConstantFP::get(type, kAntilogarithmIsSmallThreshold));
-  return ir_builder_->CreateSelect(x_is_small, for_small_x, for_large_x);
+  return b_->CreateSelect(x_is_small, for_small_x, for_large_x);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitSin(PrimitiveType prim_type,
                                                    llvm::Value* value) const {
   return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {value},
-                                      {value->getType()}, ir_builder_);
+                                      {value->getType()}, b_);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitCos(PrimitiveType prim_type,
                                                    llvm::Value* value) const {
   return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::cos, {value},
-                                      {value->getType()}, ir_builder_);
+                                      {value->getType()}, b_);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitExp(PrimitiveType prim_type,
                                                    llvm::Value* value) const {
   return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::exp, {value},
-                                      {value->getType()}, ir_builder_);
+                                      {value->getType()}, b_);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitExpm1(PrimitiveType prim_type,
@@ -1060,25 +1033,25 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitExpm1(PrimitiveType prim_type,
   // When the exponent is large, the naive evaluation of e^(x) - 1 is more
   // accurate than the Taylor series.
   TF_ASSIGN_OR_RETURN(auto exp_x, EmitExp(prim_type, value));
-  auto for_large_x = ir_builder_->CreateFSub(exp_x, one);
+  auto for_large_x = b_->CreateFSub(exp_x, one);
   // The Taylor series for exp(x) is 1 + x + x^2/2 + x^3/6 + ….
   // We want exp(x)-1 which is x + x^2/2 + x^3/6 + ….
-  auto x_squared = ir_builder_->CreateFAdd(x, x);
-  auto x_squared_over_two = ir_builder_->CreateFMul(x_squared, half);
-  auto for_small_x = ir_builder_->CreateFAdd(x, x_squared_over_two);
+  auto x_squared = b_->CreateFAdd(x, x);
+  auto x_squared_over_two = b_->CreateFMul(x_squared, half);
+  auto for_small_x = b_->CreateFAdd(x, x_squared_over_two);
   const auto kExponentIsSmallThreshold = 1e-5;
-  auto abs_x = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value},
-                                            {type}, ir_builder_);
-  auto x_is_small = ir_builder_->CreateFCmpOLT(
+  auto abs_x =
+      llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, {type}, b_);
+  auto x_is_small = b_->CreateFCmpOLT(
       abs_x, llvm::ConstantFP::get(type, kExponentIsSmallThreshold));
-  return ir_builder_->CreateSelect(x_is_small, for_small_x, for_large_x);
+  return b_->CreateSelect(x_is_small, for_small_x, for_large_x);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitPow(PrimitiveType prim_type,
                                                    llvm::Value* lhs,
                                                    llvm::Value* rhs) const {
   return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::pow, {lhs, rhs},
-                                      {lhs->getType()}, ir_builder_);
+                                      {lhs->getType()}, b_);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitAtan2(PrimitiveType prim_type,
@@ -1093,11 +1066,10 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitReducePrecision(
     return Unimplemented("reduce-precision only implemented for F32");
   }
   return EmitReducePrecisionFloat(x, /*exponent_bits=*/hlo->exponent_bits(),
-                                  /*mantissa_bits=*/hlo->mantissa_bits(),
-                                  ir_builder_);
+                                  /*mantissa_bits=*/hlo->mantissa_bits(), b_);
 }
 
-static llvm::Value* SaturateShiftIfNecessary(llvm::IRBuilder<>* ir_builder,
+static llvm::Value* SaturateShiftIfNecessary(llvm::IRBuilder<>* b,
                                              llvm::Value* lhs, llvm::Value* rhs,
                                              llvm::Value* shift_result,
                                              bool saturate_to_sign_bit) {
@@ -1110,15 +1082,14 @@ static llvm::Value* SaturateShiftIfNecessary(llvm::IRBuilder<>* ir_builder,
   llvm::ConstantInt* minus_one = llvm::ConstantInt::get(integer_type, -1);
   llvm::Value* saturated_value;
   if (saturate_to_sign_bit) {
-    saturated_value = ir_builder->CreateSelect(
-        ir_builder->CreateICmpSLT(lhs, zero), minus_one, zero);
+    saturated_value =
+        b->CreateSelect(b->CreateICmpSLT(lhs, zero), minus_one, zero);
   } else {
     saturated_value = zero;
   }
   llvm::Value* shift_amt_in_range =
-      ir_builder->CreateICmpULT(rhs, integer_bitsize_constant, "shft.chk");
-  return ir_builder->CreateSelect(shift_amt_in_range, shift_result,
-                                  saturated_value);
+      b->CreateICmpULT(rhs, integer_bitsize_constant, "shft.chk");
+  return b->CreateSelect(shift_amt_in_range, shift_result, saturated_value);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
@@ -1127,49 +1098,49 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
   switch (op->opcode()) {
     // TODO(jingyue): add the "nsw" attribute for signed types.
     case HloOpcode::kAdd:
-      return ir_builder_->CreateAdd(lhs_value, rhs_value);
+      return b_->CreateAdd(lhs_value, rhs_value);
     case HloOpcode::kSubtract:
-      return ir_builder_->CreateSub(lhs_value, rhs_value);
+      return b_->CreateSub(lhs_value, rhs_value);
     case HloOpcode::kMultiply:
-      return ir_builder_->CreateMul(lhs_value, rhs_value);
+      return b_->CreateMul(lhs_value, rhs_value);
     case HloOpcode::kDivide:
-      return is_signed ? ir_builder_->CreateSDiv(lhs_value, rhs_value)
-                       : ir_builder_->CreateUDiv(lhs_value, rhs_value);
+      return is_signed ? b_->CreateSDiv(lhs_value, rhs_value)
+                       : b_->CreateUDiv(lhs_value, rhs_value);
     case HloOpcode::kRemainder:
-      return is_signed ? ir_builder_->CreateSRem(lhs_value, rhs_value)
-                       : ir_builder_->CreateURem(lhs_value, rhs_value);
+      return is_signed ? b_->CreateSRem(lhs_value, rhs_value)
+                       : b_->CreateURem(lhs_value, rhs_value);
     case HloOpcode::kEq:
       return llvm_ir::EmitComparison(llvm::CmpInst::ICMP_EQ, lhs_value,
-                                     rhs_value, ir_builder_);
+                                     rhs_value, b_);
     case HloOpcode::kNe:
       return llvm_ir::EmitComparison(llvm::CmpInst::ICMP_NE, lhs_value,
-                                     rhs_value, ir_builder_);
+                                     rhs_value, b_);
     case HloOpcode::kLt:
       return llvm_ir::EmitComparison(
           is_signed ? llvm::CmpInst::ICMP_SLT : llvm::CmpInst::ICMP_ULT,
-          lhs_value, rhs_value, ir_builder_);
+          lhs_value, rhs_value, b_);
     case HloOpcode::kGt:
       return llvm_ir::EmitComparison(
           is_signed ? llvm::CmpInst::ICMP_SGT : llvm::CmpInst::ICMP_UGT,
-          lhs_value, rhs_value, ir_builder_);
+          lhs_value, rhs_value, b_);
     case HloOpcode::kLe:
       return llvm_ir::EmitComparison(
           is_signed ? llvm::CmpInst::ICMP_SLE : llvm::CmpInst::ICMP_ULE,
-          lhs_value, rhs_value, ir_builder_);
+          lhs_value, rhs_value, b_);
     case HloOpcode::kGe:
       return llvm_ir::EmitComparison(
           is_signed ? llvm::CmpInst::ICMP_SGE : llvm::CmpInst::ICMP_UGE,
-          lhs_value, rhs_value, ir_builder_);
+          lhs_value, rhs_value, b_);
     case HloOpcode::kMinimum:
       return EmitIntegralMin(lhs_value, rhs_value, is_signed);
     case HloOpcode::kMaximum:
       return EmitIntegralMax(lhs_value, rhs_value, is_signed);
     case HloOpcode::kAnd:
-      return ir_builder_->CreateAnd(lhs_value, rhs_value);
+      return b_->CreateAnd(lhs_value, rhs_value);
     case HloOpcode::kOr:
-      return ir_builder_->CreateOr(lhs_value, rhs_value);
+      return b_->CreateOr(lhs_value, rhs_value);
     case HloOpcode::kXor:
-      return ir_builder_->CreateXor(lhs_value, rhs_value);
+      return b_->CreateXor(lhs_value, rhs_value);
 
     // Shifting out bits >= the number of bits in the type being shifted
     // produces a poison value in LLVM which is basically "deferred undefined
@@ -1177,20 +1148,17 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
     // UB.  We replace the poison value with a constant to avoid this deferred
     // UB.
     case HloOpcode::kShiftRightArithmetic:
-      return SaturateShiftIfNecessary(
-          ir_builder_, lhs_value, rhs_value,
-          ir_builder_->CreateAShr(lhs_value, rhs_value),
-          /*saturate_to_sign_bit=*/true);
+      return SaturateShiftIfNecessary(b_, lhs_value, rhs_value,
+                                      b_->CreateAShr(lhs_value, rhs_value),
+                                      /*saturate_to_sign_bit=*/true);
     case HloOpcode::kShiftLeft:
-      return SaturateShiftIfNecessary(
-          ir_builder_, lhs_value, rhs_value,
-          ir_builder_->CreateShl(lhs_value, rhs_value),
-          /*saturate_to_sign_bit=*/false);
+      return SaturateShiftIfNecessary(b_, lhs_value, rhs_value,
+                                      b_->CreateShl(lhs_value, rhs_value),
+                                      /*saturate_to_sign_bit=*/false);
     case HloOpcode::kShiftRightLogical:
-      return SaturateShiftIfNecessary(
-          ir_builder_, lhs_value, rhs_value,
-          ir_builder_->CreateLShr(lhs_value, rhs_value),
-          /*saturate_to_sign_bit=*/false);
+      return SaturateShiftIfNecessary(b_, lhs_value, rhs_value,
+                                      b_->CreateLShr(lhs_value, rhs_value),
+                                      /*saturate_to_sign_bit=*/false);
     default:
       return Unimplemented("binary integer op '%s'",
                            HloOpcodeString(op->opcode()).c_str());
@@ -1200,21 +1168,19 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitIntegerBinaryOp(
 llvm::Value* ElementalIrEmitter::EmitIntegralMax(llvm::Value* lhs_value,
                                                  llvm::Value* rhs_value,
                                                  bool is_signed) const {
-  return ir_builder_->CreateSelect(
-      ir_builder_->CreateICmp(
-          is_signed ? llvm::ICmpInst::ICMP_SGE : llvm::ICmpInst::ICMP_UGE,
-          lhs_value, rhs_value),
-      lhs_value, rhs_value);
+  return b_->CreateSelect(b_->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SGE
+                                                   : llvm::ICmpInst::ICMP_UGE,
+                                         lhs_value, rhs_value),
+                          lhs_value, rhs_value);
 }
 
 llvm::Value* ElementalIrEmitter::EmitIntegralMin(llvm::Value* lhs_value,
                                                  llvm::Value* rhs_value,
                                                  bool is_signed) const {
-  return ir_builder_->CreateSelect(
-      ir_builder_->CreateICmp(
-          is_signed ? llvm::ICmpInst::ICMP_SLE : llvm::ICmpInst::ICMP_ULE,
-          lhs_value, rhs_value),
-      lhs_value, rhs_value);
+  return b_->CreateSelect(b_->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SLE
+                                                   : llvm::ICmpInst::ICMP_ULE,
+                                         lhs_value, rhs_value),
+                          lhs_value, rhs_value);
 }
 
 llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex(
@@ -1257,180 +1223,254 @@ llvm_ir::IrArray::Index ElementalIrEmitter::ElementwiseSourceIndex(
   return source_index;
 }
 
-llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator(
+StatusOr<llvm::Value*> ElementalIrEmitter::ConvertValueForDistribution(
     const HloInstruction* hlo,
-    const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator)
-    const {
-  PrimitiveType param_prim_type = hlo->operand(0)->shape().element_type();
-  llvm::Type* param_ir_type =
-      llvm_ir::PrimitiveTypeToIrType(param_prim_type, module_);
-
-  // Same values as PCG library
-  // https://github.com/imneme/pcg-c/blob/master/include/pcg_variants.h
-  llvm::Value* multiplier = ir_builder_->getInt(
-      llvm::APInt(128, {0x4385DF649FCCF645, 0x2360ED051FC65DA4}));
-  llvm::Value* increment = ir_builder_->getInt(
-      llvm::APInt(128, {0x14057B7EF767814F, 0x5851F42D4C957F2D}));
-
-  auto random_value_from_hlo = [hlo]() {
-    const HloModule* module =
-        hlo->IsFused() ? hlo->parent()->FusionInstruction()->parent()->parent()
-                       : hlo->parent()->parent();
-    return module->RandomNew64();
-  };
+    const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
+    const llvm_ir::IrArray::Index& index, llvm::Value* raw_value) const {
+  TF_ASSIGN_OR_RETURN(llvm::Value * a_or_mean,
+                      operand_to_generator.at(hlo->operand(0))(index));
+  TF_ASSIGN_OR_RETURN(llvm::Value * b_or_sigma,
+                      operand_to_generator.at(hlo->operand(1))(index));
+  PrimitiveType elem_prim_ty = hlo->shape().element_type();
+  llvm::Type* elem_ir_ty =
+      llvm_ir::PrimitiveTypeToIrType(elem_prim_ty, module_);
+  llvm::Type* raw_value_ty = raw_value->getType();
+
+  // Convert raw integer to float in range [0, 1) if the element is a float.
+  llvm::Value* elem_value = raw_value;
+  if (elem_ir_ty->isFloatingPointTy()) {
+    elem_value = b_->CreateUIToFP(elem_value, elem_ir_ty);
+    unsigned raw_value_size_in_bits = raw_value_ty->getPrimitiveSizeInBits();
+    CHECK(raw_value_size_in_bits == 32 || raw_value_size_in_bits == 64);
+    elem_value = b_->CreateFDiv(
+        elem_value,
+        llvm::ConstantFP::get(elem_ir_ty,
+                              raw_value_size_in_bits == 64 ? 0x1p64 : 0x1p32));
+  }
+
+  // Convert the value for the requested distribution.
+  switch (hlo->random_distribution()) {
+    case RNG_UNIFORM: {
+      if (elem_ir_ty->isFloatingPointTy()) {
+        return b_->CreateFAdd(
+            b_->CreateFMul(b_->CreateFSub(b_or_sigma, a_or_mean), elem_value),
+            a_or_mean);
+      } else {
+        // To generate a uniform random value in [a, b) from a raw random sample
+        // in range [0, 2^N), we let range = b - a and return
+        // (a + raw_value % range). If range is not a power of 2, raw values
+        // larger than (2^N - 2^N % range) are biased toward results in
+        // [a, a + (limit % range)). An unbiased algorithm would need to drop
+        // raw values and re-sample, but we don't do this because re-sampling in
+        // an efficient way is complex, and it's not clear that users need it.
+        // In particular, if one thread in a GPU warp needs to re-sample, we pay
+        // the same cost as if the whole warp were to re-sample.  So an
+        // efficient re-sampling implementation on GPU would need to do
+        // nontrivial work to share entropy between threads in the warp.
+        auto range = b_->CreateSub(b_or_sigma, a_or_mean);
+        return b_->CreateAdd(a_or_mean, b_->CreateURem(elem_value, range));
+      }
+    }
+    case RNG_NORMAL: {
+      TF_ASSIGN_OR_RETURN(
+          llvm::Value * r,
+          EmitErfcInv(elem_prim_ty,
+                      b_->CreateFMul(llvm::ConstantFP::get(elem_ir_ty, 2.0),
+                                     elem_value)));
+      return b_->CreateFAdd(b_->CreateFMul(r, b_or_sigma), a_or_mean);
+    }
+    default:
+      return InvalidArgument(
+          "unhandled distribution %s",
+          RandomDistribution_Name(hlo->random_distribution()).c_str());
+  }
+}
+
+namespace {
+
+// Checks that the primitive type is supported by the elemental IR emitter for
+// Philox RNG and returns the number of elements in each 128 bit sample of the
+// Philox RNG algorithm.
+int32 GetNumberOfElementsPerPhiloxRngSample(PrimitiveType elem_prim_ty) {
+  // Calculate the number of elements, that is the number of random numbers, in
+  // a 128 bit sample.
+  switch (elem_prim_ty) {
+    case U32:
+    case S32:
+    case F32:
+    // The algorithm uses 32 bits to generate values for F16.
+    case F16:
+      return 4;
+    case U64:
+    case F64:
+      return 2;
+    default:
+      // BF16 is converted to F16 by the hlo pass HloElementTypeConverter.
+      // Other data types are not supported by XLA random operation.
+      LOG(FATAL) << "Unrecognized primitive type for RNG " << elem_prim_ty;
+  }
+  return 0;
+}
+
+// Calculates the four uint32 values for the 128-bit Philox sample.
+std::array<llvm::Value*, 4> CalculateSampleValues(
+    llvm::Value* sample_idx, llvm::Value* hlo_random_value,
+    llvm::Value* global_random_number, llvm::Value* rng_state,
+    llvm::IRBuilder<>* b) {
+  llvm::Type* index_ty = sample_idx->getType();
+
+  std::array<llvm::Value*, 4> counter_values;
+
+  // Use the sample index to initialize counter[0] and counter[1].
+  unsigned index_ty_size_in_bits = index_ty->getPrimitiveSizeInBits();
+  CHECK(index_ty_size_in_bits == 32 || index_ty_size_in_bits == 64);
+  if (index_ty_size_in_bits == 32) {
+    counter_values[0] = sample_idx;
+    counter_values[1] = b->getInt32(0);
+  } else {
+    std::tie(counter_values[0], counter_values[1]) =
+        llvm_ir::SplitInt64ToInt32s(b, sample_idx);
+  }
+
+  // Xor the global state variable with the global random number seed and use
+  // the result to initialize counter[2] and counter[3].
+  std::tie(counter_values[2], counter_values[3]) = llvm_ir::SplitInt64ToInt32s(
+      b, b->CreateXor(rng_state, global_random_number));
 
-  // Seed each RNG emitter with a new 64-bit seed from the HloModule. If the
-  // compilation order is deterministic (i.e., RandomNew64 invocation order is
-  // deterministic), then the order of RNG is deterministic for a given seed and
-  // hence tests will be deterministic.
-  // If the user provides a global seed instruction then we only use 64-bits of
-  // the host's random number generator to seed the 128 bit value with the other
-  // 64-bits is due to a user specified global seed instruction.
-  // Create a GlobalVariable to maintain state between invocations. There is a
-  // bug in NVPTX with GlobalVariable and 128 bit values, so using 2 64-bit
+  // The algorithm uses a 64 bit key, which is also interpreted as two uint32
   // values.
-  llvm::GlobalVariable* state_ptr0 = new llvm::GlobalVariable(
-      /*M=*/*module_,
-      /*Ty=*/ir_builder_->getInt64Ty(),
-      /*isConstant=*/false,
-      /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
-      /*Initializer=*/ir_builder_->getInt64(random_value_from_hlo()),
-      /*Name=*/"state_ptr0");
-
-  // When the module config seed is 0, the expected result of a prng is a random
-  // value. Instead of using the random_value_from_hlo, we need a global random
-  // value as the graph seed. This is because if we use random_value_from_hlo
-  // here, then for a newly built hlo graph, it always gives the same number.
-  uint64 graph_seed = hlo_module_config_.seed() != 0 ? hlo_module_config_.seed()
-                                                     : GlobalRandomValue();
-  llvm::GlobalVariable* state_ptr1 = new llvm::GlobalVariable(
-      /*M=*/*module_,
-      /*Ty=*/ir_builder_->getInt64Ty(),
-      /*isConstant=*/false,
-      /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
-      /*Initializer=*/ir_builder_->getInt64(graph_seed),
-      /*Name=*/"state_ptr1");
-
-  // We want each thread to use its own stream, so we modify the increment per
-  // thread. We want the increment to remain odd, so we shift the thread id left
-  // 1 and add it to the increment.
-  increment = ir_builder_->CreateAdd(increment,
-                                     ir_builder_->CreateShl(EmitThreadId(), 1));
-
-  // PCG-XSL-RR algorithm
-  // http://www.pcg-random.org/pdf/toms-oneill-pcg-family-v1.02.pdf
-  //   state = multiplier * state + increment
-  //   return uint64_t(state ^ (state >> 64))) >>> (state >> 122)
-  // where ">>>" is bitwise rotation
-  auto get_next_i64 = [=]() {
-    llvm::Value* state0 = ir_builder_->CreateZExtOrTrunc(
-        ir_builder_->CreateLoad(state_ptr0, "state0"),
-        ir_builder_->getInt128Ty());
-    llvm::Value* state1 = ir_builder_->CreateShl(
-        ir_builder_->CreateZExtOrTrunc(
-            ir_builder_->CreateLoad(state_ptr1, "state1"),
-            ir_builder_->getInt128Ty()),
-        64);
-    llvm::Value* state = ir_builder_->CreateOr(state0, state1);
-    llvm::Value* updated = ir_builder_->CreateAdd(
-        ir_builder_->CreateMul(state, multiplier), increment);
-    ir_builder_->CreateStore(
-        ir_builder_->CreateTrunc(updated, ir_builder_->getInt64Ty()),
-        state_ptr0);
-    ir_builder_->CreateStore(
-        ir_builder_->CreateTrunc(ir_builder_->CreateLShr(updated, 64),
-                                 ir_builder_->getInt64Ty()),
-        state_ptr1);
-
-    return llvm_ir::CreateRor(
-        ir_builder_->CreateTrunc(
-            ir_builder_->CreateXor(state, ir_builder_->CreateLShr(state, 64)),
-            ir_builder_->getInt64Ty()),
-        ir_builder_->CreateTrunc(ir_builder_->CreateLShr(state, 122),
-                                 ir_builder_->getInt64Ty()),
-        ir_builder_);
-  };
+  llvm::Value* key_values[2];
+
+  // Use a module random number to initialize the key.
+  std::tie(key_values[0], key_values[1]) =
+      llvm_ir::SplitInt64ToInt32s(b, hlo_random_value);
+
+  // Prepare the constants used in the Philox RNG Algorithm.
+  llvm::Value* philoxW32A = b->getInt32(0x9E3779B9);
+  llvm::Value* philoxW32B = b->getInt32(0xBB67AE85);
+  llvm::Value* philoxM4xW32A = b->getInt32(0xD2511F53);
+  llvm::Value* philoxM4xW32B = b->getInt32(0xCD9E8D57);
+
+  // Compute the 128 bit value for the current sample by repeating the
+  // single round computation and key raising computation for ten times.
+  for (int round = 0; round < 10; ++round) {
+    // A single round of computation of the counter values is as follows:
+    //  MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0);
+    //  MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1);
+    //  counter[0] = hi1 ^ counter[1] ^ key[0];
+    //  counter[1] = lo1;
+    //  counter[2] = hi0 ^ counter[3] ^ key[1];
+    //  counter[3] = lo0;
+    llvm::Value* lo0;
+    llvm::Value* hi0;
+    std::tie(lo0, hi0) =
+        llvm_ir::UMulLowHigh32(b, philoxM4xW32A, counter_values[0]);
+    llvm::Value* lo1;
+    llvm::Value* hi1;
+    std::tie(lo1, hi1) =
+        llvm_ir::UMulLowHigh32(b, philoxM4xW32B, counter_values[2]);
+    counter_values[0] =
+        b->CreateXor(hi1, b->CreateXor(counter_values[1], key_values[0]));
+    counter_values[1] = lo1;
+    counter_values[2] =
+        b->CreateXor(hi0, b->CreateXor(counter_values[3], key_values[1]));
+    counter_values[3] = lo0;
+    key_values[0] = b->CreateAdd(key_values[0], philoxW32A);
+    key_values[1] = b->CreateAdd(key_values[1], philoxW32B);
+  }
 
-  auto get_next_uniform_float = [=]() {
-    return ir_builder_->CreateFDiv(
-        ir_builder_->CreateUIToFP(get_next_i64(), param_ir_type),
-        llvm::ConstantFP::get(param_ir_type, 0x1p64));
-  };
+  return counter_values;
+}
 
+}  // namespace
+
+// Implements the Philox algorithm to generate random numbers in parallel.
+// Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.
+//   http://www.thesalmons.org/john/random123/papers/random123sc11.pdf
+//
+// The paper presents a few variants of the Philox algorithm, we picked the
+// 4x32_10 version of the algorithm for the following reasons:
+//   . 4x32 uses 32-bit multiplication which is fast on GPUs.
+//   . The authors recommend the 10-round variant, and TensorFlow also uses it.
+//
+// Precondition: the RNG instruction is not fused.
+llvm_ir::ElementGenerator ElementalIrEmitter::MakePhiloxRngElementGenerator(
+    const HloInstruction* hlo,
+    const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator)
+    const {
+  VLOG(3) << "Using philox RNG algorithm";
+  CHECK(!hlo->IsFused());
+  // A random number generated by the per module random number generator.
+  // This ensures that each RNG HLO generates a different random sequence.
+  llvm::Value* hlo_random_value = b_->getInt64(hlo->GetModule()->RandomNew64());
+  // A value specified by the configuration or generated by a global random
+  // number generator.
+  llvm::Value* global_random_number =
+      b_->getInt64(hlo_module_config_.seed() != 0 ? hlo_module_config_.seed()
+                                                  : GlobalRandomValue());
+
+  int elems_per_sample =
+      GetNumberOfElementsPerPhiloxRngSample(hlo->shape().element_type());
+
+  // Allocate stack storage for the 128 bit sample as four int32.
+  llvm::Type* int32_ty = b_->getInt32Ty();
+  llvm::Value* sample_address = llvm_ir::EmitAllocaAtFunctionEntryWithCount(
+      int32_ty, /*element_count=*/b_->getInt32(4), "sample", b_);
+
+  // Load the global state variable for the Philox RNG algorithm.
+  llvm::GlobalVariable* rng_state_ptr =
+      llvm_ir::GetOrCreateVariableForPhiloxRngState(module_, b_);
+  llvm::Value* rng_state = b_->CreateLoad(rng_state_ptr, "rng_state_value");
+
+  // Build and return the elemental IR generator to generate a random value for
+  // the element corresponding to the current thread.
+  //
+  // This elemental IR generator computes one sample with multiple random
+  // numbers but only returns one random number. As a result, neighboring
+  // threads may calculate the same sample unnecessarily. However, if the
+  // kernel containing the RNG hlo is unrolled, LLVM is able to optimize away
+  // the duplicated computation of the same sample. In particular, if the unroll
+  // factor is a multiplier of elems_per_sample, LLVM is able to completely
+  // remove such duplicated computation. If the unroll factor is a non-trivial
+  // factor of elems_per_sample, LLVM can only partially remove such duplicated
+  // computation.
   return [=](const llvm_ir::IrArray::Index& index) -> StatusOr<llvm::Value*> {
-    switch (hlo->random_distribution()) {
-      case RNG_UNIFORM: {
-        TF_ASSIGN_OR_RETURN(llvm::Value * p,
-                            operand_to_generator.at(hlo->operand(0))(index));
-        TF_ASSIGN_OR_RETURN(llvm::Value * q,
-                            operand_to_generator.at(hlo->operand(1))(index));
-        if (primitive_util::IsFloatingPointType(param_prim_type)) {
-          return ir_builder_->CreateFAdd(
-              ir_builder_->CreateFMul(ir_builder_->CreateFSub(q, p),
-                                      get_next_uniform_float()),
-              p);
-        } else {
-          auto r = ir_builder_->CreateSub(q, p);
-          auto leading_zeros = llvm_ir::EmitCallToIntrinsic(
-              llvm::Intrinsic::ctlz, {r, ir_builder_->getInt1(true)},
-              {param_ir_type}, ir_builder_);
-          auto in_block = ir_builder_->GetInsertBlock();
-
-          // A terminator should be present iff we're emitting code
-          // into the middle (as opposed to the end) of a basic block.
-          CHECK_EQ(ir_builder_->GetInsertPoint() == in_block->end(),
-                   in_block->getTerminator() == nullptr);
-
-          llvm::BasicBlock* body_block;
-          llvm::BasicBlock* out_block;
-
-          if (ir_builder_->GetInsertPoint() == in_block->end()) {
-            body_block = llvm_ir::CreateBasicBlock(
-                nullptr, IrName(hlo, "rng_body"), ir_builder_);
-            out_block = llvm_ir::CreateBasicBlock(
-                nullptr, IrName(hlo, "rng_out"), ir_builder_);
-            llvm::BranchInst::Create(body_block, in_block);
-          } else {
-            body_block = in_block->splitBasicBlock(
-                ir_builder_->GetInsertPoint(), "rng_body");
-            out_block = body_block->splitBasicBlock(
-                ir_builder_->GetInsertPoint(), "rng_out");
-            body_block->getTerminator()->eraseFromParent();
-          }
-
-          SetToFirstInsertPoint(body_block, ir_builder_);
-          auto random = ir_builder_->CreateAnd(
-              ir_builder_->CreateZExtOrTrunc(get_next_i64(), param_ir_type),
-              ir_builder_->CreateLShr(llvm::ConstantInt::get(param_ir_type, ~0),
-                                      leading_zeros));
-          llvm::BranchInst::Create(out_block, body_block,
-                                   ir_builder_->CreateICmpULT(random, r),
-                                   body_block);
-          SetToFirstInsertPoint(out_block, ir_builder_);
-          return ir_builder_->CreateAdd(
-              p, ir_builder_->CreateSelect(
-                     ir_builder_->CreateICmpEQ(p, q),
-                     llvm::ConstantInt::get(param_ir_type, 0), random));
-        }
-      }
-      case RNG_NORMAL: {
-        TF_ASSIGN_OR_RETURN(llvm::Value * m,
-                            operand_to_generator.at(hlo->operand(0))(index));
-        TF_ASSIGN_OR_RETURN(llvm::Value * s,
-                            operand_to_generator.at(hlo->operand(1))(index));
-        TF_ASSIGN_OR_RETURN(
-            llvm::Value * r,
-            EmitErfcInv(param_prim_type,
-                        ir_builder_->CreateFMul(
-                            llvm::ConstantFP::get(param_ir_type, 2.0),
-                            get_next_uniform_float())));
-        return ir_builder_->CreateFAdd(ir_builder_->CreateFMul(r, s), m);
-      }
-      default:
-        return InvalidArgument(
-            "unhandled distribution %s",
-            RandomDistribution_Name(hlo->random_distribution()).c_str());
+    llvm::Type* index_ty = index.GetType();
+    // Calculate the linear element index.
+    llvm::Value* elem_idx = index.linear();
+    if (elem_idx == nullptr) {
+      elem_idx = index.Linearize(AsInt64Slice(hlo->shape().dimensions()), b_);
     }
+
+    // Calculate the index for the 128 bit sample and the offset of the current
+    // element within the sample.
+    llvm::Value* elems_per_sample_value =
+        llvm::ConstantInt::get(index_ty, elems_per_sample);
+    llvm::Value* sample_idx = b_->CreateUDiv(elem_idx, elems_per_sample_value);
+    llvm::Value* elem_offset = b_->CreateURem(elem_idx, elems_per_sample_value);
+
+    std::array<llvm::Value*, 4> counter_values = CalculateSampleValues(
+        sample_idx, hlo_random_value, global_random_number, rng_state, b_);
+
+    // Store the four counter_values into the sample_address alloca so we can
+    // load the elem_offset'th one below.
+    for (int idx = 0; idx < 4; ++idx) {
+      b_->CreateStore(counter_values[idx],
+                      b_->CreateInBoundsGEP(sample_address, b_->getInt32(idx)));
+    }
+
+    llvm::Type* int64_ty = b_->getInt64Ty();
+    CHECK(elems_per_sample == 2 || elems_per_sample == 4);
+    llvm::Type* raw_value_ty = elems_per_sample == 2 ? int64_ty : int32_ty;
+    // Retrieve the raw value for the current element from the current sample.
+    llvm::Value* raw_elem_value = b_->CreateLoad(
+        b_->CreateInBoundsGEP(
+            b_->CreatePointerCast(sample_address, raw_value_ty->getPointerTo()),
+            elem_offset),
+        "raw_elem_value");
+
+    return ConvertValueForDistribution(hlo, operand_to_generator, index,
+                                       raw_elem_value);
   };
 }
 
@@ -1447,9 +1487,8 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalSelect(
   TF_ASSIGN_OR_RETURN(llvm::Value * on_false_value,
                       operand_to_generator.at(hlo->operand(2))(
                           ElementwiseSourceIndex(index, *hlo, 2)));
-  return ir_builder_->CreateSelect(
-      ir_builder_->CreateTrunc(pred_value, ir_builder_->getInt1Ty()),
-      on_true_value, on_false_value);
+  return b_->CreateSelect(b_->CreateTrunc(pred_value, b_->getInt1Ty()),
+                          on_true_value, on_false_value);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalClamp(
@@ -1485,64 +1524,62 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalConcatenate(
   const int64 concat_dim = hlo->dimensions(0);
   auto source_index = target_index;
 
-  llvm::BasicBlock* init_block = ir_builder_->GetInsertBlock();
+  llvm::BasicBlock* init_block = b_->GetInsertBlock();
 
   // A terminator should be present iff we're emitting code
   // into the middle (as opposed to the end) of a basic block.
-  CHECK_EQ(ir_builder_->GetInsertPoint() == init_block->end(),
+  CHECK_EQ(b_->GetInsertPoint() == init_block->end(),
            init_block->getTerminator() == nullptr);
 
   llvm::BasicBlock* exit_block;
-  if (ir_builder_->GetInsertPoint() == init_block->end()) {
+  if (b_->GetInsertPoint() == init_block->end()) {
     exit_block = llvm_ir::CreateBasicBlock(
-        /*insert_before=*/nullptr, IrName(hlo, "merge"), ir_builder_);
+        /*insert_before=*/nullptr, IrName(hlo, "merge"), b_);
   } else {
-    exit_block = init_block->splitBasicBlock(ir_builder_->GetInsertPoint(),
+    exit_block = init_block->splitBasicBlock(b_->GetInsertPoint(),
                                              AsStringRef(IrName(hlo, "merge")));
     init_block->getTerminator()->eraseFromParent();
   }
 
-  llvm_ir::SetToFirstInsertPoint(exit_block, ir_builder_);
-  llvm::PHINode* output = ir_builder_->CreatePHI(
+  llvm_ir::SetToFirstInsertPoint(exit_block, b_);
+  llvm::PHINode* output = b_->CreatePHI(
       llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_),
       hlo->operands().size());
-  auto prior_insert_point = ir_builder_->GetInsertPoint();
+  auto prior_insert_point = b_->GetInsertPoint();
 
-  ir_builder_->SetInsertPoint(init_block);
+  b_->SetInsertPoint(init_block);
 
   for (int64 operand_idx = 0; operand_idx < hlo->operand_count();
        ++operand_idx) {
     const HloInstruction* operand = hlo->operand(operand_idx);
     auto true_block = llvm_ir::CreateBasicBlock(
-        exit_block, StrCat("concat_index_from_operand", operand_idx),
-        ir_builder_);
+        exit_block, StrCat("concat_index_from_operand", operand_idx), b_);
     auto false_block = llvm_ir::CreateBasicBlock(
-        exit_block, StrCat("concat_index_not_from_operand", operand_idx),
-        ir_builder_);
+        exit_block, StrCat("concat_index_not_from_operand", operand_idx), b_);
     auto concat_dim_size =
         llvm::ConstantInt::get(source_index[concat_dim]->getType(),
                                operand->shape().dimensions(concat_dim));
-    ir_builder_->CreateCondBr(
-        ir_builder_->CreateICmpULT(source_index[concat_dim], concat_dim_size),
+    b_->CreateCondBr(
+        b_->CreateICmpULT(source_index[concat_dim], concat_dim_size),
         true_block, false_block);
 
     // Create the terminator of the true block before calling operand
     // generators, because they require non-degenerate basic blocks.
-    ir_builder_->SetInsertPoint(
+    b_->SetInsertPoint(
         llvm::BranchInst::Create(exit_block, /*InsertAtEnd=*/true_block));
     TF_ASSIGN_OR_RETURN(llvm::Value * value,
                         operand_to_generator.at(operand)(source_index));
-    output->addIncoming(value, ir_builder_->GetInsertBlock());
+    output->addIncoming(value, b_->GetInsertBlock());
 
     // Subtract the size of the concat dimension of the current operand
     // from the source index.
-    ir_builder_->SetInsertPoint(false_block);
+    b_->SetInsertPoint(false_block);
     source_index[concat_dim] =
-        ir_builder_->CreateSub(source_index[concat_dim], concat_dim_size);
+        b_->CreateSub(source_index[concat_dim], concat_dim_size);
   }
 
-  ir_builder_->CreateUnreachable();
-  ir_builder_->SetInsertPoint(exit_block, prior_insert_point);
+  b_->CreateUnreachable();
+  b_->SetInsertPoint(exit_block, prior_insert_point);
   return output;
 }
 
@@ -1566,12 +1603,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicSlice(
 
     // Clamp the start index so that the sliced portion fits in the operand:
     // start_index = clamp(start_index, 0, operand_dim_size - output_dim_size)
-
-    // TODO(b/74360564): This is implementation defined behavior, but is
-    // currently respected by all implementations. Change this if we ever decide
-    // to officially document different behavior.
-    start_index_value =
-        ir_builder_->CreateSExtOrTrunc(start_index_value, index_type);
+    start_index_value = b_->CreateSExtOrTrunc(start_index_value, index_type);
     int64 largest_valid_start_index =
         input_hlo->shape().dimensions(i) - hlo->shape().dimensions(i);
     CHECK_GE(largest_valid_start_index, 0);
@@ -1591,7 +1623,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicSlice(
   for (int64 i = 0; i < rank; ++i) {
     // Emit IR which computes:
     //   input_index = start_index + offset_index
-    input_index[i] = ir_builder_->CreateAdd(slice_start_index[i], index[i]);
+    input_index[i] = b_->CreateAdd(slice_start_index[i], index[i]);
   }
   return operand_to_generator.at(input_hlo)(input_index);
 }
@@ -1649,7 +1681,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalGather(
 
   auto add_to_operand_index = [&](llvm::Value* index_component, int64 dim) {
     llvm::Value* gather_dim_component_extended =
-        ir_builder_->CreateSExtOrTrunc(index_component, index_type);
+        b_->CreateSExtOrTrunc(index_component, index_type);
     int64 operand_dim = dim_numbers.gather_dims_to_operand_dims(dim);
     int64 output_dim = operand_to_output_dim[operand_dim];
     // If 'output_dim' is -1, it means 'operand_dim' is an elided window dim.
@@ -1673,7 +1705,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalGather(
                         gather_dim_component_extended, is_signed),
         is_signed);
 
-    operand_index[operand_dim] = ir_builder_->CreateAdd(
+    operand_index[operand_dim] = b_->CreateAdd(
         operand_index[operand_dim], gather_dim_component_extended_inbound);
   };
 
@@ -1708,7 +1740,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
   llvm_ir::IrArray::Index slice_limit_index(index.GetType(), rank);
   // Slice intersection gathers (ANDs) conditions on all ranks for which
   // 'input' is set to 'update'
-  llvm::Value* slice_intersection = ir_builder_->getTrue();
+  llvm::Value* slice_intersection = b_->getTrue();
 
   for (int64 i = 0; i < rank; ++i) {
     llvm::Type* index_type = index[0]->getType();
@@ -1721,12 +1753,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
 
     // Clamp the start index so that the update region fits in the operand.
     // start_index = clamp(start_index, 0, input_dim_size - update_dim_size)
-
-    // TODO(b/74360564): This is implementation defined behavior, but is
-    // currently respected by all implementations. Change this if we ever decide
-    // to officially document different behavior.
-    start_index_value =
-        ir_builder_->CreateSExtOrTrunc(start_index_value, index_type);
+    start_index_value = b_->CreateSExtOrTrunc(start_index_value, index_type);
     llvm::Value* update_dim_size =
         index_typed_const(update_hlo->shape().dimensions(i));
     int64 largest_valid_start_index =
@@ -1742,16 +1769,13 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
     start_index_value->setName(
         AsStringRef(IrName(hlo, StrCat("start_idx", i))));
     slice_start_index[i] = start_index_value;
-    slice_limit_index[i] =
-        ir_builder_->CreateAdd(slice_start_index[i], update_dim_size);
+    slice_limit_index[i] = b_->CreateAdd(slice_start_index[i], update_dim_size);
 
-    slice_intersection = ir_builder_->CreateAnd(
-        slice_intersection,
-        ir_builder_->CreateICmpSGE(index[i], slice_start_index[i]),
+    slice_intersection = b_->CreateAnd(
+        slice_intersection, b_->CreateICmpSGE(index[i], slice_start_index[i]),
         "slice_intersection");
-    slice_intersection = ir_builder_->CreateAnd(
-        slice_intersection,
-        ir_builder_->CreateICmpSLT(index[i], slice_limit_index[i]),
+    slice_intersection = b_->CreateAnd(
+        slice_intersection, b_->CreateICmpSLT(index[i], slice_limit_index[i]),
         "slice_intersection");
   }
 
@@ -1760,29 +1784,29 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDynamicUpdateSlice(
   // else                    -> return data from 'input'.
   llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry(
       llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_),
-      "ret_value_addr", ir_builder_);
-  llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
-      slice_intersection, "slice_intersection", ir_builder_);
+      "ret_value_addr", b_);
+  llvm_ir::LlvmIfData if_data =
+      llvm_ir::EmitIfThenElse(slice_intersection, "slice_intersection", b_);
 
   // Handle true BB (return data from 'update')
-  SetToFirstInsertPoint(if_data.true_block, ir_builder_);
+  SetToFirstInsertPoint(if_data.true_block, b_);
   // Compute update index for intersection case.
   llvm_ir::IrArray::Index update_index(index.GetType(), rank);
   for (int64 i = 0; i < rank; ++i) {
-    update_index[i] = ir_builder_->CreateSub(index[i], slice_start_index[i]);
+    update_index[i] = b_->CreateSub(index[i], slice_start_index[i]);
   }
   TF_ASSIGN_OR_RETURN(llvm::Value * true_value,
                       operand_to_generator.at(update_hlo)(update_index));
-  ir_builder_->CreateStore(true_value, ret_value_addr);
+  b_->CreateStore(true_value, ret_value_addr);
 
   // Handle false BB (return data from 'input')
-  SetToFirstInsertPoint(if_data.false_block, ir_builder_);
+  SetToFirstInsertPoint(if_data.false_block, b_);
   TF_ASSIGN_OR_RETURN(llvm::Value * false_value,
                       operand_to_generator.at(input_hlo)(index));
-  ir_builder_->CreateStore(false_value, ret_value_addr);
+  b_->CreateStore(false_value, ret_value_addr);
 
-  SetToFirstInsertPoint(if_data.after_block, ir_builder_);
-  return ir_builder_->CreateLoad(ret_value_addr);
+  SetToFirstInsertPoint(if_data.after_block, b_);
+  return b_->CreateLoad(ret_value_addr);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalPad(
@@ -1790,29 +1814,29 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalPad(
     const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
     const llvm_ir::IrArray::Index& padded_index) const {
   auto index = padded_index;
-  llvm::Value* in_bounds = ir_builder_->getTrue();
+  llvm::Value* in_bounds = b_->getTrue();
   for (size_t i = 0; i < index.size(); ++i) {
     auto index_typed_const = [=](int64 n) {
       return llvm::ConstantInt::get(index[i]->getType(), n);
     };
     const auto& pad_dim = hlo->padding_config().dimensions(i);
-    index[i] = ir_builder_->CreateSub(
-        index[i], index_typed_const(pad_dim.edge_padding_low()));
-    in_bounds = ir_builder_->CreateAnd(
-        in_bounds, ir_builder_->CreateICmpSGE(index[i], index_typed_const(0)),
-        "in_bounds");
-    in_bounds = ir_builder_->CreateAnd(
+    index[i] =
+        b_->CreateSub(index[i], index_typed_const(pad_dim.edge_padding_low()));
+    in_bounds = b_->CreateAnd(in_bounds,
+                              b_->CreateICmpSGE(index[i], index_typed_const(0)),
+                              "in_bounds");
+    in_bounds = b_->CreateAnd(
         in_bounds,
-        ir_builder_->CreateICmpEQ(
+        b_->CreateICmpEQ(
             index_typed_const(0),
-            ir_builder_->CreateURem(
-                index[i], index_typed_const(pad_dim.interior_padding() + 1))),
+            b_->CreateURem(index[i],
+                           index_typed_const(pad_dim.interior_padding() + 1))),
         "in_bounds");
-    index[i] = ir_builder_->CreateSDiv(
+    index[i] = b_->CreateSDiv(
         index[i], index_typed_const(pad_dim.interior_padding() + 1));
-    in_bounds = ir_builder_->CreateAnd(
+    in_bounds = b_->CreateAnd(
         in_bounds,
-        ir_builder_->CreateICmpSLT(
+        b_->CreateICmpSLT(
             index[i],
             index_typed_const(hlo->operand(0)->shape().dimensions(i))),
         "in_bounds");
@@ -1825,26 +1849,26 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalPad(
   // }
   llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry(
       llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_),
-      "pad_result_addr", ir_builder_);
+      "pad_result_addr", b_);
   llvm_ir::LlvmIfData if_data =
-      llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_);
-  SetToFirstInsertPoint(if_data.true_block, ir_builder_);
+      llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", b_);
+  SetToFirstInsertPoint(if_data.true_block, b_);
   TF_ASSIGN_OR_RETURN(llvm::Value * operand_value,
                       operand_to_generator.at(hlo->operand(0))(index));
-  ir_builder_->CreateStore(operand_value, ret_value_addr);
+  b_->CreateStore(operand_value, ret_value_addr);
 
-  SetToFirstInsertPoint(if_data.false_block, ir_builder_);
+  SetToFirstInsertPoint(if_data.false_block, b_);
   TF_ASSIGN_OR_RETURN(llvm::Value * padding_value,
                       operand_to_generator.at(hlo->operand(1))(
                           IrArray::Index(index.GetType())));
-  ir_builder_->CreateStore(padding_value, ret_value_addr);
+  b_->CreateStore(padding_value, ret_value_addr);
 
-  SetToFirstInsertPoint(if_data.after_block, ir_builder_);
+  SetToFirstInsertPoint(if_data.after_block, b_);
   // Don't create phi(operand_value, padding_value) here, because invoking
   // operand_to_generator may create new basic blocks, making the parent
   // of operand_value or padding_value no longer a predecessor of
   // if_data.after_block.
-  return ir_builder_->CreateLoad(ret_value_addr);
+  return b_->CreateLoad(ret_value_addr);
 }
 
 StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDot(
@@ -1868,21 +1892,20 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDot(
     return llvm::ConstantInt::get(index_type, c);
   };
 
-  std::unique_ptr<llvm_ir::ForLoop> inner_loop =
-      llvm_ir::ForLoop::EmitForLoop(IrName(hlo, "inner"), index_typed_const(0),
-                                    index_typed_const(contracted_dim_size),
-                                    index_typed_const(1), ir_builder_);
+  std::unique_ptr<llvm_ir::ForLoop> inner_loop = llvm_ir::ForLoop::EmitForLoop(
+      IrName(hlo, "inner"), index_typed_const(0),
+      index_typed_const(contracted_dim_size), index_typed_const(1), b_);
 
-  SetToFirstInsertPoint(inner_loop->GetPreheaderBasicBlock(), ir_builder_);
+  SetToFirstInsertPoint(inner_loop->GetPreheaderBasicBlock(), b_);
   PrimitiveType primitive_type = hlo->shape().element_type();
   llvm::Type* primitive_type_llvm =
       llvm_ir::PrimitiveTypeToIrType(primitive_type, module_);
-  llvm::Value* accumulator_alloca = llvm_ir::EmitAllocaAtFunctionEntry(
-      primitive_type_llvm, "dot_acc", ir_builder_);
-  ir_builder_->CreateStore(llvm::Constant::getNullValue(primitive_type_llvm),
-                           accumulator_alloca);
+  llvm::Value* accumulator_alloca =
+      llvm_ir::EmitAllocaAtFunctionEntry(primitive_type_llvm, "dot_acc", b_);
+  b_->CreateStore(llvm::Constant::getNullValue(primitive_type_llvm),
+                  accumulator_alloca);
 
-  SetToFirstInsertPoint(inner_loop->GetBodyBasicBlock(), ir_builder_);
+  SetToFirstInsertPoint(inner_loop->GetBodyBasicBlock(), b_);
 
   // This is the inner reduction loop for a dot operation that produces
   // one element in the output.  If the operands to the dot operation have
@@ -1902,43 +1925,36 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalDot(
   }
   rhs_index.InsertAt(rhs_contracting_dim, inner_loop->GetIndVarValue());
 
-  llvm::Value* current_accumulator =
-      ir_builder_->CreateLoad(accumulator_alloca);
+  llvm::Value* current_accumulator = b_->CreateLoad(accumulator_alloca);
   TF_ASSIGN_OR_RETURN(llvm::Value * lhs_value, lhs_generator(lhs_index));
   TF_ASSIGN_OR_RETURN(llvm::Value * rhs_value, rhs_generator(rhs_index));
   llvm::Value* next_accumulator;
   if (primitive_util::IsComplexType(primitive_type)) {
-    llvm::Value* product_real = ir_builder_->CreateFSub(
-        ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
-                                EmitExtractReal(rhs_value)),
-        ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
-                                EmitExtractImag(rhs_value)));
-    llvm::Value* product_imag = ir_builder_->CreateFAdd(
-        ir_builder_->CreateFMul(EmitExtractReal(lhs_value),
-                                EmitExtractImag(rhs_value)),
-        ir_builder_->CreateFMul(EmitExtractImag(lhs_value),
-                                EmitExtractReal(rhs_value)));
-    next_accumulator = ir_builder_->CreateInsertValue(
+    llvm::Value* product_real = b_->CreateFSub(
+        b_->CreateFMul(EmitExtractReal(lhs_value), EmitExtractReal(rhs_value)),
+        b_->CreateFMul(EmitExtractImag(lhs_value), EmitExtractImag(rhs_value)));
+    llvm::Value* product_imag = b_->CreateFAdd(
+        b_->CreateFMul(EmitExtractReal(lhs_value), EmitExtractImag(rhs_value)),
+        b_->CreateFMul(EmitExtractImag(lhs_value), EmitExtractReal(rhs_value)));
+    next_accumulator = b_->CreateInsertValue(
         current_accumulator,
-        ir_builder_->CreateFAdd(EmitExtractReal(current_accumulator),
-                                product_real),
+        b_->CreateFAdd(EmitExtractReal(current_accumulator), product_real),
         {0});
-    next_accumulator = ir_builder_->CreateInsertValue(
+    next_accumulator = b_->CreateInsertValue(
         next_accumulator,
-        ir_builder_->CreateFAdd(EmitExtractImag(current_accumulator),
-                                product_imag),
+        b_->CreateFAdd(EmitExtractImag(current_accumulator), product_imag),
         {1});
   } else if (primitive_util::IsFloatingPointType(primitive_type)) {
-    next_accumulator = ir_builder_->CreateFAdd(
-        current_accumulator, ir_builder_->CreateFMul(lhs_value, rhs_value));
+    next_accumulator = b_->CreateFAdd(current_accumulator,
+                                      b_->CreateFMul(lhs_value, rhs_value));
   } else {
-    next_accumulator = ir_builder_->CreateAdd(
-        current_accumulator, ir_builder_->CreateMul(lhs_value, rhs_value));
+    next_accumulator =
+        b_->CreateAdd(current_accumulator, b_->CreateMul(lhs_value, rhs_value));
   }
-  ir_builder_->CreateStore(next_accumulator, accumulator_alloca);
+  b_->CreateStore(next_accumulator, accumulator_alloca);
 
-  SetToFirstInsertPoint(inner_loop->GetExitBasicBlock(), ir_builder_);
-  return ir_builder_->CreateLoad(accumulator_alloca);
+  SetToFirstInsertPoint(inner_loop->GetExitBasicBlock(), b_);
+  return b_->CreateLoad(accumulator_alloca);
 }
 
 llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
@@ -2038,7 +2054,7 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
         const HloInstruction* operand = hlo->operand(0);
         auto source_index = target_index;
         for (int64 dim : hlo->dimensions()) {
-          source_index[dim] = ir_builder_->CreateSub(
+          source_index[dim] = b_->CreateSub(
               llvm::ConstantInt::get(target_index[dim]->getType(),
                                      hlo->shape().dimensions(dim) - 1),
               target_index[dim]);
@@ -2051,16 +2067,16 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
         const HloInstruction* operand = hlo->operand(0);
         // The `dimensions` member of the broadcast instruction maps from
         // input dimensions to output dimensions.
-        return operand_to_generator.at(
-            operand)(target_index.SourceIndexOfBroadcast(
-            hlo->shape(), operand->shape(), hlo->dimensions(), ir_builder_));
+        return operand_to_generator.at(operand)(
+            target_index.SourceIndexOfBroadcast(hlo->shape(), operand->shape(),
+                                                hlo->dimensions(), b_));
       };
     case HloOpcode::kSlice:
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& index) -> StatusOr<llvm::Value*> {
         IrArray::Index sliced_index = index.SourceIndexOfSlice(
             /*shape=*/hlo->shape(), /*starts=*/hlo->slice_starts(),
-            /*strides=*/hlo->slice_strides(), /*builder=*/ir_builder_);
+            /*strides=*/hlo->slice_strides(), /*builder=*/b_);
         return operand_to_generator.at(hlo->operand(0))(sliced_index);
       };
     case HloOpcode::kDynamicSlice:
@@ -2085,27 +2101,26 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
                ShapeUtil::ElementsIn(hlo->operand(0)->shape()));
       return [this, hlo, &operand_to_generator](const IrArray::Index& index) {
         const HloInstruction* operand = hlo->operand(0);
-        return operand_to_generator.at(operand)(index.SourceIndexOfBitcast(
-            hlo->shape(), operand->shape(), ir_builder_));
+        return operand_to_generator.at(operand)(
+            index.SourceIndexOfBitcast(hlo->shape(), operand->shape(), b_));
       };
     case HloOpcode::kReshape:
       CHECK_EQ(ShapeUtil::ElementsIn(hlo->shape()),
                ShapeUtil::ElementsIn(hlo->operand(0)->shape()));
       return [this, hlo, &operand_to_generator](const IrArray::Index& index) {
         const HloInstruction* operand = hlo->operand(0);
-        return operand_to_generator.at(operand)(index.SourceIndexOfReshape(
-            hlo->shape(), operand->shape(), ir_builder_));
+        return operand_to_generator.at(operand)(
+            index.SourceIndexOfReshape(hlo->shape(), operand->shape(), b_));
       };
     case HloOpcode::kTranspose:
       return [this, hlo,
               &operand_to_generator](const IrArray::Index& target_index) {
         return operand_to_generator.at(hlo->operand(0))(
             target_index.SourceIndexOfTranspose(
-                hlo->shape(), hlo->operand(0)->shape(), hlo->dimensions(),
-                ir_builder_));
+                hlo->shape(), hlo->operand(0)->shape(), hlo->dimensions(), b_));
       };
     case HloOpcode::kRng:
-      return MakeRngElementGenerator(hlo, operand_to_generator);
+      return MakePhiloxRngElementGenerator(hlo, operand_to_generator);
     case HloOpcode::kPad:
       return [this, hlo, &operand_to_generator](
                  const IrArray::Index& padded_index) -> StatusOr<llvm::Value*> {
@@ -2127,11 +2142,11 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
 }
 
 llvm::Value* ElementalIrEmitter::EmitExtractReal(llvm::Value* value) const {
-  return ir_builder_->CreateExtractValue(value, {0});
+  return b_->CreateExtractValue(value, {0});
 }
 
 llvm::Value* ElementalIrEmitter::EmitExtractImag(llvm::Value* value) const {
-  return ir_builder_->CreateExtractValue(value, {1});
+  return b_->CreateExtractValue(value, {1});
 }
 
 llvm::Value* ElementalIrEmitter::EmitComposeComplex(const HloInstruction* op,
@@ -2139,10 +2154,10 @@ llvm::Value* ElementalIrEmitter::EmitComposeComplex(const HloInstruction* op,
                                                     llvm::Value* imag) const {
   auto cplx_type =
       llvm_ir::PrimitiveTypeToIrType(op->shape().element_type(), module_);
-  auto complex = ir_builder_->CreateInsertValue(
+  auto complex = b_->CreateInsertValue(
       llvm::ConstantAggregateZero::get(cplx_type), real, {0});
   if (imag != nullptr) {
-    complex = ir_builder_->CreateInsertValue(complex, imag, {1});
+    complex = b_->CreateInsertValue(complex, imag, {1});
   }
   return complex;
 }
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
index d199473374ad394913413a7d3fe805f8782936f7..fcb34557a52d35ef30a5dee643171e17407d05c2 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h
@@ -34,10 +34,8 @@ class ElementalIrEmitter {
       std::unordered_map<const HloInstruction*, llvm_ir::ElementGenerator>;
 
   ElementalIrEmitter(const HloModuleConfig& hlo_module_config,
-                     llvm::Module* module, llvm::IRBuilder<>* ir_builder)
-      : ir_builder_(ir_builder),
-        module_(module),
-        hlo_module_config_(hlo_module_config) {}
+                     llvm::Module* module, llvm::IRBuilder<>* b)
+      : b_(b), module_(module), hlo_module_config_(hlo_module_config) {}
 
   virtual ~ElementalIrEmitter() = default;
 
@@ -54,7 +52,7 @@ class ElementalIrEmitter {
       const HloInstruction* hlo,
       const HloToElementGeneratorMap& operand_to_generator) const;
 
-  llvm::IRBuilder<>* ir_builder() const { return ir_builder_; }
+  llvm::IRBuilder<>* b() const { return b_; }
   llvm::Module* module() const { return module_; }
 
  protected:
@@ -144,9 +142,7 @@ class ElementalIrEmitter {
       int64 operand_no) const;
 
   // Identifier of the thread unique among all threads on the device
-  virtual llvm::Value* EmitThreadId() const {
-    return ir_builder_->getIntN(128, 0);
-  }
+  virtual llvm::Value* EmitThreadId() const { return b_->getIntN(128, 0); }
 
   StatusOr<llvm::Value*> EmitElementalSelect(
       const HloInstruction* hlo,
@@ -188,7 +184,7 @@ class ElementalIrEmitter {
       const HloToElementGeneratorMap& operand_to_generator,
       const llvm_ir::IrArray::Index& dot_result_index) const;
 
-  llvm::IRBuilder<>* const ir_builder_;
+  llvm::IRBuilder<>* const b_;
 
   llvm::Module* module_;
 
@@ -197,10 +193,17 @@ class ElementalIrEmitter {
   const HloModuleConfig& hlo_module_config_;
 
  private:
-  // Returns a ElementGenerator for a RNG HloInstruction.
-  llvm_ir::ElementGenerator MakeRngElementGenerator(
+  // Returns a ElementGenerator for an RNG HloInstruction using the Philox
+  // random number generation algorithm.
+  llvm_ir::ElementGenerator MakePhiloxRngElementGenerator(
       const HloInstruction* hlo,
       const HloToElementGeneratorMap& operand_to_generator) const;
+  // Converts the raw value generated by a random number generation algorithm
+  // to the distribution requested by the RNG HloInstruction.
+  StatusOr<llvm::Value*> ConvertValueForDistribution(
+      const HloInstruction* hlo,
+      const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
+      const llvm_ir::IrArray::Index& index, llvm::Value* raw_value) const;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 7cf2746947846d1fa729a34b324dd442143728f1..fd75847d0c0e737957401b8efc420d504a3c0706 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -82,7 +82,18 @@ StatusOr<ScopedShapedBuffer> Executable::ExecuteOnStreamWrapper(
 
   StatusOr<ScopedShapedBuffer> return_value =
       ExecuteOnStream(run_options, arguments, profile_ptr.get());
-  TF_RETURN_IF_ERROR(return_value.status());
+  if (!return_value.status().ok()) {
+    if (profile != nullptr) {
+      // Ensure the ThenStartTimer call has completed before we destroy timer.
+      // We already have a failure status to return, so just log this if it
+      // fails.
+      Status status = stream->BlockHostUntilDone();
+      if (!status.ok()) {
+        LOG(ERROR) << "Failed to BlockHostUntilDone: " << status;
+      }
+    }
+    return return_value.status();
+  }
 
   if (profile != nullptr) {
     VLOG(1) << "enqueueing 'stop timer' and blocking host until done...";
diff --git a/tensorflow/compiler/xla/service/execution_tracker.cc b/tensorflow/compiler/xla/service/execution_tracker.cc
index 6794cfe297b0fb9a15eb9b7e6906d225f9597d07..228c3fac95c3114484637bd93ec51c60b44403cc 100644
--- a/tensorflow/compiler/xla/service/execution_tracker.cc
+++ b/tensorflow/compiler/xla/service/execution_tracker.cc
@@ -25,7 +25,7 @@ limitations under the License.
 namespace xla {
 
 AsyncExecution::AsyncExecution(Backend* backend,
-                               std::vector<Backend::StreamPtr> streams,
+                               std::vector<StreamPool::Ptr> streams,
                                const ExecutionProfile& profile,
                                GlobalDataHandle result)
     : backend_(CHECK_NOTNULL(backend)),
@@ -46,9 +46,10 @@ Status AsyncExecution::BlockUntilDone() const {
 
 ExecutionTracker::ExecutionTracker() : next_handle_(1) {}
 
-ExecutionHandle ExecutionTracker::Register(
-    Backend* backend, std::vector<Backend::StreamPtr> streams,
-    const ExecutionProfile& profile, GlobalDataHandle result) {
+ExecutionHandle ExecutionTracker::Register(Backend* backend,
+                                           std::vector<StreamPool::Ptr> streams,
+                                           const ExecutionProfile& profile,
+                                           GlobalDataHandle result) {
   tensorflow::mutex_lock lock(execution_mutex_);
   int64 handle = next_handle_++;
   auto inserted = handle_to_execution_.emplace(
diff --git a/tensorflow/compiler/xla/service/execution_tracker.h b/tensorflow/compiler/xla/service/execution_tracker.h
index 4458152dd9a98890fc3a3e7f324245ec68821467..4e9b9f883e26f5564a9c63a40d2b4b9348908214 100644
--- a/tensorflow/compiler/xla/service/execution_tracker.h
+++ b/tensorflow/compiler/xla/service/execution_tracker.h
@@ -22,7 +22,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/service/backend.h"
-#include "tensorflow/compiler/xla/service/pool.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -40,7 +40,7 @@ namespace xla {
 // the stream when destructed.
 class AsyncExecution {
  public:
-  AsyncExecution(Backend* backend, std::vector<Backend::StreamPtr> streams,
+  AsyncExecution(Backend* backend, std::vector<StreamPool::Ptr> streams,
                  const ExecutionProfile& profile, GlobalDataHandle result);
 
   Status BlockUntilDone() const;
@@ -54,7 +54,7 @@ class AsyncExecution {
   Backend* backend_;
 
   // Stream on which the execution is launched.
-  std::vector<Backend::StreamPtr> streams_;
+  std::vector<StreamPool::Ptr> streams_;
 
   // Profile object of the execution to be returned to the user.
   ExecutionProfile profile_;
@@ -72,7 +72,7 @@ class ExecutionTracker {
   // Registers an execution with its backend, streams, and data handle to the
   // execution result. Returns a handle for the registered execution.
   ExecutionHandle Register(Backend* backend,
-                           std::vector<Backend::StreamPtr> stream,
+                           std::vector<StreamPool::Ptr> stream,
                            const ExecutionProfile& profile,
                            GlobalDataHandle data);
 
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index a043795a21b2da8af742972f735d76deeea91ef4..a73a341fdb1699184ecb9580cbd029dd2411b109 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -36,6 +36,7 @@ cc_library(
     hdrs = ["gpu_constants.h"],
     deps = [
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/core:framework",
     ],
 )
 
@@ -113,11 +114,13 @@ cc_library(
     srcs = ["hlo_to_ir_bindings.cc"],
     hdrs = ["hlo_to_ir_bindings.h"],
     deps = [
+        ":buffer_allocations",
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service/llvm_ir:alias_analysis",
+        "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util",
         "//tensorflow/compiler/xla/service/llvm_ir:ir_array",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/compiler/xla/service/llvm_ir:tuple_ops",
@@ -141,6 +144,7 @@ cc_library(
     ],
     deps = [
         ":backend_configs",
+        ":buffer_allocations",
         ":cudnn_convolution_runner",
         ":elemental_ir_emitter",
         ":gpu_constants",
@@ -162,6 +166,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:elemental_ir_emitter",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:name_uniquer",
+        "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util",
         "//tensorflow/compiler/xla/service/llvm_ir:dynamic_update_slice_util",
         "//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter",
         "//tensorflow/compiler/xla/service/llvm_ir:ir_array",
@@ -170,6 +175,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter",
+        "//tensorflow/compiler/xla/service/llvm_ir:sort_util",
         "//tensorflow/compiler/xla/service/llvm_ir:tuple_ops",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
@@ -215,6 +221,7 @@ cc_library(
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter",
+        "//tensorflow/compiler/xla/service/llvm_ir:math_ops",
         "//tensorflow/core:lib",
         "@llvm//:core",
         "@llvm//:support",
@@ -245,7 +252,7 @@ cc_library(
     deps = [
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_execution_profile",
-        "//tensorflow/compiler/xla/service:pool",
+        "//tensorflow/compiler/xla/service:stream_pool",
         "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
         "//tensorflow/core:stream_executor_no_cuda",
@@ -268,6 +275,7 @@ cc_library(
         "memset_thunk.cc",
         "outfeed_thunk.cc",
         "sequential_thunk.cc",
+        "thunk.cc",
         "thunk_schedule.cc",
         "tuple_thunk.cc",
         "while_thunk.cc",
@@ -319,6 +327,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/compiler/xla/service:tuple_points_to_analysis",
+        "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
@@ -448,6 +457,7 @@ cc_library(
     srcs = ["multi_output_fusion.cc"],
     hdrs = ["multi_output_fusion.h"],
     deps = [
+        ":instruction_fusion",
         ":ir_emission_utils",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/service:hlo",
@@ -460,6 +470,7 @@ tf_cc_test(
     name = "multi_output_fusion_test",
     srcs = ["multi_output_fusion_test.cc"],
     deps = [
+        ":instruction_fusion",
         ":multi_output_fusion",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
@@ -803,6 +814,7 @@ cc_library(
     deps = [
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:stream_executor_no_cuda",
     ],
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
index ab5149dcdb09290cd0c0b2233029d0988a95f036..537295292b6ced72c4b2c456557b3c06e0aa5254 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
@@ -44,17 +44,27 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
       num_buffers, device_ordinal, memory_allocator, buffer_assignment));
 
   for (BufferAllocation::Index i = 0; i < num_buffers; ++i) {
+    const BufferAllocation& allocation = buffer_assignment->GetAllocation(i);
+    const int64 expected_alignment = [&] {
+      if (allocation.is_entry_computation_parameter()) {
+        return kEntryParameterAlignBytes;
+      } else if (allocation.is_constant()) {
+        return kConstantBufferAlignBytes;
+      } else {
+        return kXlaAllocatedBufferAlignBytes;
+      }
+    }();
+
     // If buffer #i's address is already registered (e.g. external arguments or
     // result buffers), use that registered buffer.
     if (registered_buffers_.count(i)) {
       se::DeviceMemoryBase address = FindOrDie(registered_buffers_, i);
-      if (reinterpret_cast<uintptr_t>(address.opaque()) %
-              kCudaMallocAlignBytes !=
+      if (reinterpret_cast<uintptr_t>(address.opaque()) % expected_alignment !=
           0) {
         return InternalError(
             "Address of registered buffer %lld must be a multiple of %llx, but "
             "was %p",
-            i, kCudaMallocAlignBytes, address.opaque());
+            i, kEntryParameterAlignBytes, address.opaque());
       }
       buffer_allocations->SetBuffer(i, FindOrDie(registered_buffers_, i));
       continue;
@@ -62,7 +72,6 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
 
     // Allocate each allocation that might escape, or is the temp buffer.
     bool seen_temp_buffer = false;
-    const BufferAllocation& allocation = buffer_assignment->GetAllocation(i);
     if (allocation.maybe_live_out() || allocation.IsPreallocatedTempBuffer()) {
       const int64 buffer_size = allocation.size();
       se::DeviceMemoryBase buffer_address;
@@ -70,13 +79,12 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
         OwningDeviceMemory buffer;
         TF_ASSIGN_OR_RETURN(
             buffer, memory_allocator->Allocate(device_ordinal, buffer_size));
-        if (reinterpret_cast<uintptr_t>(buffer.opaque()) %
-                kCudaMallocAlignBytes !=
+        if (reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment !=
             0) {
           return InternalError(
               "Address returned by memory_allocator->Allocate must be a "
               "multiple of %llx, but was %p",
-              kCudaMallocAlignBytes, buffer.opaque());
+              kXlaAllocatedBufferAlignBytes, buffer.opaque());
         }
         // We do manual memory management within BufferAllocations.  Be sure not
         // to do a TF_RETURN_IF_ERROR between this line and the
@@ -165,5 +173,10 @@ void BufferAllocations::SetBuffer(BufferAllocation::Index buffer_index,
   buffers_[buffer_index] = buffer;
 }
 
+bool ShouldEmitLiteralInLlvmIr(const Literal& literal) {
+  // LLVM can sometimes do interesting optimizations using scalar constants.
+  return ShapeUtil::IsScalar(literal.shape());
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
index 636623502597b3a66523938ba430e9d5a82f796c..f13eab0dd787a2bfa687c991f9d808568360fd24 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
@@ -107,6 +107,12 @@ class BufferAllocations {
   bool torn_down_ = false;
 };
 
+// LLVM and PTXAS don't deal well with large constants, so we only emit very
+// small constants directly in LLVM IR.  Larger constants are emitted with zero
+// initializers in LLVM IR and are later overwritten when the PTX/CUBIN is
+// loaded.
+bool ShouldEmitLiteralInLlvmIr(const Literal& literal);
+
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index e594cec2f8d5743b742e07db70a71cc81279f9b7..cc38db27e2680e950f74e104cef8829585c7b81c 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/math_ops.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -67,8 +68,8 @@ bool IsFPLiteralWithValue(const HloInstruction* operand, float value) {
 
 GpuElementalIrEmitter::GpuElementalIrEmitter(
     const HloModuleConfig& hlo_module_config, llvm::Module* module,
-    llvm::IRBuilder<>* ir_builder, NestedComputer compute_nested)
-    : ElementalIrEmitter(hlo_module_config, module, ir_builder),
+    llvm::IRBuilder<>* b, NestedComputer compute_nested)
+    : ElementalIrEmitter(hlo_module_config, module, b),
       hlo_module_config_(hlo_module_config),
       compute_nested_(std::move(compute_nested)) {}
 
@@ -92,8 +93,8 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLibdeviceMathCall(
       cast_result_to_fp16 = true;
       for (int64 i = 0; i < operands.size(); ++i) {
         if (input_types[i] == F16) {
-          converted_operands[i] = ir_builder_->CreateFPCast(
-              converted_operands[i], ir_builder_->getFloatTy());
+          converted_operands[i] =
+              b_->CreateFPCast(converted_operands[i], b_->getFloatTy());
           converted_input_types[i] = F32;
         }
       }
@@ -112,7 +113,7 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitLibdeviceMathCall(
                                      converted_input_types, output_type)
                             .ValueOrDie();
   if (cast_result_to_fp16) {
-    result = ir_builder_->CreateFPCast(result, ir_builder_->getHalfTy());
+    result = b_->CreateFPCast(result, b_->getHalfTy());
   }
   return result;
 }
@@ -215,7 +216,7 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitPowerOp(
     // LLVM's NVPTX backend knows how to transform 1/sqrt(A) into the NVPTX
     // rsqrt.approx instruction.
     TF_ASSIGN_OR_RETURN(auto* sqrt, make_sqrt());
-    return ir_builder_->CreateFDiv(llvm::ConstantFP::get(llvm_ty, 1), sqrt);
+    return b_->CreateFDiv(llvm::ConstantFP::get(llvm_ty, 1), sqrt);
   }
 
   VLOG(10) << "emitting pow as regular call to pow(): " << op->ToString();
@@ -277,6 +278,16 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatUnaryOp(
   PrimitiveType output_type = op->shape().element_type();
   switch (op->opcode()) {
     case HloOpcode::kTanh:
+      // If we don't care much about precision, emit a fast approximation of
+      // tanh.
+      if (hlo_module_config_.debug_options().xla_enable_fast_math()) {
+        // Upcast F16 to F32 if necessary.
+        llvm::Type* type =
+            input_type == F16 ? b_->getFloatTy() : operand_value->getType();
+        llvm::Value* input = b_->CreateFPCast(operand_value, type);
+        llvm::Value* fast_tanh = llvm_ir::EmitFastTanh(b_, input);
+        return b_->CreateFPCast(fast_tanh, operand_value->getType());
+      }
       return EmitLibdeviceMathCall("__nv_tanh", {operand_value}, {input_type},
                                    output_type);
     default:
@@ -302,32 +313,31 @@ llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall(
 
   // Declares the callee if it is not declared already.
   llvm::Function* callee = llvm::cast<llvm::Function>(
-      ir_builder_->GetInsertBlock()->getModule()->getOrInsertFunction(
+      b_->GetInsertBlock()->getModule()->getOrInsertFunction(
           llvm_ir::AsStringRef(callee_name), callee_type));
 
   for (auto attribute : attributes) {
     callee->addFnAttr(attribute);
   }
 
-  return ir_builder_->CreateCall(callee, llvm_ir::AsArrayRef(operands));
+  return b_->CreateCall(callee, llvm_ir::AsArrayRef(operands));
 }
 
 llvm::Value* GpuElementalIrEmitter::EmitThreadId() const {
-  llvm::Value* block_id = ir_builder_->CreateIntCast(
+  llvm::Value* block_id = b_->CreateIntCast(
       llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
-                                   {}, {}, ir_builder_),
-      ir_builder_->getIntNTy(128), /*isSigned=*/true, "block.id");
-  llvm::Value* thread_id_in_block = ir_builder_->CreateIntCast(
+                                   {}, {}, b_),
+      b_->getIntNTy(128), /*isSigned=*/true, "block.id");
+  llvm::Value* thread_id_in_block = b_->CreateIntCast(
       llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x,
-                                   {}, {}, ir_builder_),
-      ir_builder_->getIntNTy(128), /*isSigned=*/true, "thread.id");
-  llvm::Value* threads_per_block = ir_builder_->CreateIntCast(
+                                   {}, {}, b_),
+      b_->getIntNTy(128), /*isSigned=*/true, "thread.id");
+  llvm::Value* threads_per_block = b_->CreateIntCast(
       llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x,
-                                   {}, {}, ir_builder_),
-      ir_builder_->getIntNTy(128), /*isSigned=*/true, "threads_per_block");
-  return ir_builder_->CreateNSWAdd(
-      ir_builder_->CreateNSWMul(block_id, threads_per_block),
-      thread_id_in_block);
+                                   {}, {}, b_),
+      b_->getIntNTy(128), /*isSigned=*/true, "threads_per_block");
+  return b_->CreateNSWAdd(b_->CreateNSWMul(block_id, threads_per_block),
+                          thread_id_in_block);
 }
 
 llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
@@ -373,12 +383,12 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
         PrimitiveType operand_element_type = operand->shape().element_type();
         llvm::Value* accum_ptr = llvm_ir::EmitAllocaAtFunctionEntry(
             llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_),
-            "reduce_window_accum_ptr", ir_builder_);
+            "reduce_window_accum_ptr", b_);
         {
           TF_ASSIGN_OR_RETURN(llvm::Value * init_value,
                               operand_to_generator.at(hlo->operand(1))(
                                   IrArray::Index(index.GetType())));
-          ir_builder_->CreateStore(init_value, accum_ptr);
+          b_->CreateStore(init_value, accum_ptr);
         }
 
         llvm::Type* index_type = index.GetType();
@@ -386,7 +396,7 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
           return index.GetConstantWithIndexType(c);
         };
 
-        llvm_ir::ForLoopNest loops(IrName(hlo), ir_builder_, index_type);
+        llvm_ir::ForLoopNest loops(IrName(hlo), b_, index_type);
         std::vector<int64> window_size;
         for (const auto& dim : window.dimensions()) {
           window_size.push_back(dim.size());
@@ -395,15 +405,15 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
             ShapeUtil::MakeShape(operand_element_type, window_size), "window");
         CHECK_EQ(window_index.size(), index.size());
 
-        SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), ir_builder_);
+        SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b_);
 
         IrArray::Index input_index(index_type, index.size());
-        llvm::Value* in_bounds = ir_builder_->getInt1(true);
+        llvm::Value* in_bounds = b_->getInt1(true);
         for (size_t i = 0; i < index.size(); ++i) {
-          llvm::Value* stridden_index = ir_builder_->CreateNSWMul(
+          llvm::Value* stridden_index = b_->CreateNSWMul(
               index[i], index_typed_const(window.dimensions(i).stride()));
-          input_index[i] = ir_builder_->CreateNSWSub(
-              ir_builder_->CreateNSWAdd(stridden_index, window_index[i]),
+          input_index[i] = b_->CreateNSWSub(
+              b_->CreateNSWAdd(stridden_index, window_index[i]),
               index_typed_const(window.dimensions(i).padding_low()));
 
           // We must check whether 0 ≤ input_index[i] < bound, as otherwise
@@ -411,16 +421,16 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
           // comparison is equivalent to the unsigned comparison
           // input_index[i] < bound, as a negative value wraps to a large
           // positive value.
-          in_bounds = ir_builder_->CreateAnd(
+          in_bounds = b_->CreateAnd(
               in_bounds,
-              ir_builder_->CreateICmpULT(
+              b_->CreateICmpULT(
                   input_index[i],
                   index_typed_const(operand->shape().dimensions(i))));
         }
 
         llvm_ir::LlvmIfData if_data =
-            llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_);
-        SetToFirstInsertPoint(if_data.true_block, ir_builder_);
+            llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", b_);
+        SetToFirstInsertPoint(if_data.true_block, b_);
 
         // We are not in pad, so do the computation.
         TF_ASSIGN_OR_RETURN(llvm::Value * input_value,
@@ -428,26 +438,26 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
         TF_ASSIGN_OR_RETURN(
             llvm::Value * accum_value,
             compute_nested_(*hlo->to_apply(),
-                            {ir_builder_->CreateLoad(accum_ptr), input_value}));
-        ir_builder_->CreateStore(accum_value, accum_ptr);
+                            {b_->CreateLoad(accum_ptr), input_value}));
+        b_->CreateStore(accum_value, accum_ptr);
 
-        SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), ir_builder_);
-        return ir_builder_->CreateLoad(accum_ptr);
+        SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b_);
+        return b_->CreateLoad(accum_ptr);
       };
     case HloOpcode::kReduce:
       return [=, &operand_to_generator](
                  const IrArray::Index& output_index) -> StatusOr<llvm::Value*> {
         const HloInstruction* operand = hlo->operand(0);
         llvm::Value* accum_ptr =
-            ir_builder()->CreateAlloca(llvm_ir::PrimitiveTypeToIrType(
+            b()->CreateAlloca(llvm_ir::PrimitiveTypeToIrType(
                 hlo->shape().element_type(), module_));
         llvm::Type* index_type = output_index.GetType();
         TF_ASSIGN_OR_RETURN(llvm::Value * init_value,
                             operand_to_generator.at(hlo->operand(1))(
                                 IrArray::Index(index_type)));
-        ir_builder()->CreateStore(init_value, accum_ptr);
+        b()->CreateStore(init_value, accum_ptr);
 
-        llvm_ir::ForLoopNest loops(IrName(hlo), ir_builder_, index_type);
+        llvm_ir::ForLoopNest loops(IrName(hlo), b_, index_type);
         IrArray::Index input_index = loops.AddLoopsForShapeOnDimensions(
             operand->shape(), hlo->dimensions(), "reduction_dim");
         if (!ShapeUtil::IsScalar(hlo->shape())) {
@@ -462,18 +472,17 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator(
           CHECK_EQ(output_index.size(), j);
         }
 
-        SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), ir_builder());
+        SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b());
         TF_ASSIGN_OR_RETURN(
             llvm::Value * input_value,
             operand_to_generator.at(hlo->operand(0))(input_index));
         TF_ASSIGN_OR_RETURN(
             llvm::Value * accum_value,
-            compute_nested_(
-                *hlo->to_apply(),
-                {ir_builder()->CreateLoad(accum_ptr), input_value}));
-        ir_builder()->CreateStore(accum_value, accum_ptr);
-        SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), ir_builder());
-        return ir_builder()->CreateLoad(accum_ptr);
+            compute_nested_(*hlo->to_apply(),
+                            {b()->CreateLoad(accum_ptr), input_value}));
+        b()->CreateStore(accum_value, accum_ptr);
+        SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b());
+        return b()->CreateLoad(accum_ptr);
       };
     default:
       return ElementalIrEmitter::MakeElementGenerator(hlo,
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
index 91f4d960aa62fff3e0699ece37a8c74d7dcf2f59..e3eacef133cb8b615a645ca2f11dd6dedf9f0176 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h
@@ -43,7 +43,7 @@ class GpuElementalIrEmitter : public ElementalIrEmitter {
       const HloComputation&, tensorflow::gtl::ArraySlice<llvm::Value*>)>;
 
   GpuElementalIrEmitter(const HloModuleConfig& hlo_module_config,
-                        llvm::Module* module, llvm::IRBuilder<>* ir_builder,
+                        llvm::Module* module, llvm::IRBuilder<>* b,
                         NestedComputer compute_nested);
 
   llvm_ir::ElementGenerator MakeElementGenerator(
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_constants.cc b/tensorflow/compiler/xla/service/gpu/gpu_constants.cc
index aa360c7f73de2f0f9cf59c22b552b8e60ddb3a87..7f0b030fece8f25578bd90a538279d455350278a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_constants.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_constants.cc
@@ -14,12 +14,23 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
+#include "tensorflow/core/framework/allocator.h"
 
 namespace xla {
 namespace gpu {
 
-// http://docs.nvidia.com/cuda/cuda-c-programming-guide/#device-memory-accesses
-const int64 kCudaMallocAlignBytes = 256;
+// kEntryParameterAlignBytes is equal to EIGEN_MAX_ALIGN_BYTES, though including
+// Eigen headers here to get that symbol may not be a good idea.
+// EIGEN_MAX_ALIGN_BYTES may differ between CUDA-enabled builds vs CUDA-disabled
+// builds and we don't want the IR generated by XLA:GPU to depend on that.
+//
+// TODO(b/111767313): Consider raising EIGEN_MAX_ALIGN_BYTES if it helps.
+const int64 kEntryParameterAlignBytes = 16;
+
+const int64 kXlaAllocatedBufferAlignBytes =
+    tensorflow::Allocator::kAllocatorAlignment;
+
+const int64 kConstantBufferAlignBytes = kXlaAllocatedBufferAlignBytes;
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_constants.h b/tensorflow/compiler/xla/service/gpu/gpu_constants.h
index eb1ca4c6c95a23d2a08f5f9c3cbc85e7d47d4f89..6f5f1fa09c57dfd246d702c0adc92c7e2e76805a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_constants.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_constants.h
@@ -21,9 +21,15 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-// Minimum alignment of cudaMalloc.  We require that buffers created by our
-// DeviceMemoryAllocator, and all input/output buffers, have this alignment.
-extern const int64 kCudaMallocAlignBytes;
+// Minimum alignment for buffers passed as incoming arguments by TensorFlow.
+extern const int64 kEntryParameterAlignBytes;
+
+// Minimum alignment for buffers allocated by XLA: the temp buffers and the live
+// out (result) buffers.
+extern const int64 kXlaAllocatedBufferAlignBytes;
+
+// Minimum alignment for constant buffers.
+extern const int64 kConstantBufferAlignBytes;
 
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
index fbc1303085b579e898d2f503a341754109768567..75f414e47fe3edcc1b10b392ed5cc5038be6c190 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
@@ -48,80 +48,17 @@ StatusOr<bool> GpuCopyInsertion::Run(HloModule* module) {
 
   TF_ASSIGN_OR_RETURN(bool changed, generic_copy_insertion.Run(module));
 
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloDataflowAnalysis> dataflow,
-                      HloDataflowAnalysis::Run(*module));
-
-  // Make sure all operands of a library call are in memory instead of constants
-  // in IR. Also, init values of while and conditional nodes cannot be
-  // constants. Insert copies for any constants found at the operands of these
-  // nodes.
-  tensorflow::gtl::FlatSet<HloInstruction*> inserted_copies;
+  // Check the assumption that the epsilon and feature_index constants of the
+  // CUDNN batchnorm op are not shared with other ops where we would replace
+  // them with a copy. These custom op calls are generated with the
+  // CudnnBatchNormRewriter, so this would only happen if HloCSE merges them.
   for (HloComputation* computation : module->computations()) {
     for (HloInstruction* hlo : computation->instructions()) {
-      // Inserts a copy of hlo->operand(n) if it's a constant.
-      auto copy_operand_if_constant = [&](int64 n) -> Status {
-        HloInstruction* operand = hlo->mutable_operand(n);
-        // Skip the operands that have already been replaced with a copy in a
-        // previous iteration (which is possible when a constant is used as an
-        // operand in multiple places).
-        if (ContainsKey(inserted_copies, operand)) {
-          return Status::OK();
-        }
-        for (auto& pair : dataflow->GetInstructionValueSet(operand)) {
-          const HloValueSet& value_set = pair.second;
-          for (const HloValue* value : value_set.values()) {
-            if (value->defining_instruction()->IsConstant() &&
-                !ContainsKey(hlo_to_copy_map_, value->defining_instruction())) {
-              HloInstruction* constant = value->defining_instruction();
-              TF_ASSIGN_OR_RETURN(HloInstruction * copy,
-                                  FindOrInsertCopy(constant));
-              TF_RETURN_IF_ERROR(constant->ReplaceAllUsesWith(copy));
-              inserted_copies.insert(copy);
-              changed = true;
-            }
-          }
-        }
-        return Status::OK();
-      };
-
-      if (IsCustomCallToDnnBatchNorm(*hlo)) {
-        // The epsilon and feature_index operands to a CUDNN batchnorm op don't
-        // need to be materialized in memory -- in fact, they must be constants.
-        // These are the last two operands of all three batchnorm ops.
-        for (int64 i = 0; i < hlo->operand_count() - 2; ++i) {
-          TF_RETURN_IF_ERROR(copy_operand_if_constant(i));
-        }
-      } else if (ImplementedAsLibraryCall(*hlo) ||
-                 hlo->opcode() == HloOpcode::kCrossReplicaSum ||
-                 hlo->opcode() == HloOpcode::kWhile ||
-                 hlo->opcode() == HloOpcode::kConditional) {
-        // For all other library calls, cross-replica-sum, while and conditional
-        // ops materialize all the operands into memory.  (Cross-replica-sum
-        // gets its constant args materialized even if it's not implemented as a
-        // libcall to simplify the implementation.  It's slower, but we can
-        // constant fold away constant args *anyway*, so we just need to make it
-        // work.)
-        for (int64 i = 0; i < hlo->operand_count(); ++i) {
-          TF_RETURN_IF_ERROR(copy_operand_if_constant(i));
-        }
+      if (!IsCustomCallToDnnBatchNorm(*hlo)) {
+        continue;
       }
-    }
-  }
-
-  if (changed) {
-    // Check the assumption that the epsilon and feature_index constants of the
-    // CUDNN batchnorm op are not shared with other ops where we would replace
-    // them with a copy. These custom op calls are generated with the
-    // CudnnBatchNormRewriter, so this would only happen if HloCSE merges them.
-    for (HloComputation* computation : module->computations()) {
-      for (HloInstruction* hlo : computation->instructions()) {
-        if (!IsCustomCallToDnnBatchNorm(*hlo)) {
-          continue;
-        }
-        for (int64 i = hlo->operand_count() - 2; i < hlo->operand_count();
-             ++i) {
-          CHECK_EQ(hlo->operand(i)->opcode(), HloOpcode::kConstant);
-        }
+      for (int64 i = hlo->operand_count() - 2; i < hlo->operand_count(); ++i) {
+        CHECK_EQ(hlo->operand(i)->opcode(), HloOpcode::kConstant);
       }
     }
   }
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 0cad2958c72797b4d70f00676928b2b21d7a3e8d..bb71c79fd7646c9d3bad282d8041a9a05aec0485 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
 #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
 #include "tensorflow/compiler/xla/service/logical_buffer.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
@@ -84,7 +85,7 @@ Status GpuExecutable::ExecuteThunks(
   }
 
   // Stream 0 indicates `main_stream` and substreams start from stream 1.
-  std::vector<Pool<se::Stream>::SmartPtr> sub_streams;
+  std::vector<StreamPool::Ptr> sub_streams;
   sub_streams.reserve(thunk_schedule_->StreamCount() - 1);
   while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
     sub_streams.emplace_back();
@@ -181,6 +182,55 @@ Status GpuExecutable::ExecuteThunks(
   return Status::OK();
 }
 
+StatusOr<const GpuExecutable::BufferAllocToDeviceMemoryMap*>
+GpuExecutable::ResolveConstantGlobals(se::StreamExecutor* executor) {
+  tensorflow::mutex_lock lock(module_handle_mutex_);
+  auto it = module_globals_.find(executor);
+  if (it != module_globals_.end()) {
+    return &it->second;
+  }
+
+  se::MultiModuleLoaderSpec module_spec;
+  if (!cubin().empty()) {
+    module_spec.AddCudaCubinInMemory(cubin());
+  }
+  module_spec.AddCudaPtxInMemory(ptx().c_str());
+
+  tensorflow::gtl::FlatMap<int64, se::DeviceMemoryBase> globals;
+  se::ModuleHandle module_handle;
+  executor->LoadModule(module_spec, &module_handle);
+
+  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
+       ++i) {
+    const BufferAllocation& allocation = assignment_->GetAllocation(i);
+    if (allocation.is_constant()) {
+      TF_ASSIGN_OR_RETURN(
+          se::DeviceMemoryBase global,
+          executor->GetUntypedSymbol(
+              llvm_ir::ConstantBufferAllocationToGlobalName(allocation),
+              module_handle));
+      VLOG(3) << "Resolved global "
+              << llvm_ir::ConstantBufferAllocationToGlobalName(allocation)
+              << " to " << global.opaque();
+      InsertOrDie(&globals, i, global);
+
+      const Literal& literal =
+          llvm_ir::LiteralForConstantAllocation(allocation);
+      CHECK(ShapeUtil::IsArray(literal.shape()));
+      if (!ShouldEmitLiteralInLlvmIr(literal)) {
+        VLOG(3) << "H2D memcpy for constant with shape "
+                << ShapeUtil::HumanString(literal.shape());
+        TF_RETURN_IF_ERROR(executor->SynchronousMemcpyH2D(
+            literal.untyped_data(), allocation.size(), &global));
+      }
+    }
+  }
+
+  module_handles_.emplace(executor,
+                          se::ScopedModuleHandle(executor, module_handle));
+  return &module_globals_.emplace(executor, std::move(globals)).first->second;
+}
+
 StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
@@ -192,6 +242,10 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
   }
 
   BufferAllocations::Builder buffer_allocations_builder;
+  se::StreamExecutor* executor = run_options->stream()->parent();
+
+  TF_ASSIGN_OR_RETURN(auto* const globals, ResolveConstantGlobals(executor));
+
   for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
        ++i) {
     const BufferAllocation& allocation = assignment_->GetAllocation(i);
@@ -213,8 +267,12 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
 
       buffer_allocations_builder.RegisterBuffer(i, buffer);
     }
+
+    if (allocation.is_constant()) {
+      buffer_allocations_builder.RegisterBuffer(i, FindOrDie(*globals, i));
+    }
   }
-  se::StreamExecutor* executor = run_options->stream()->parent();
+
   TF_ASSIGN_OR_RETURN(
       auto buffer_allocations,
       buffer_allocations_builder.Build(
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 80ec38c3ac114fe4ad9d56784330c1144d913db1..c7ce6d0acbbbe594040271c0d45c71c016e36514 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -34,6 +34,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
@@ -66,7 +68,7 @@ class GpuExecutable : public Executable {
   }
 
   // Returns the compiled PTX for the computation.
-  tensorflow::StringPiece ptx() const { return ptx_; }
+  const string& ptx() const { return ptx_; }
 
   // Returns the cubin (compiled PTX) stored in this GpuExecutable.  May be
   // empty, in which case compilation is left up to the GPU driver.
@@ -98,6 +100,15 @@ class GpuExecutable : public Executable {
   // computation. Uses points-to analysis from buffer assignment.
   const PointsToSet& GetRootPointsToSet() const;
 
+  using BufferAllocToDeviceMemoryMap =
+      tensorflow::gtl::FlatMap<BufferAllocation::Index, se::DeviceMemoryBase>;
+
+  // Loads the PTX or CUBIN for this executable into `executor` and resolves the
+  // globals corresponding to constant buffers.  Returns a map mapping buffer
+  // allocation indices to GPU pointers.
+  StatusOr<const BufferAllocToDeviceMemoryMap*> ResolveConstantGlobals(
+      stream_executor::StreamExecutor* executor);
+
   // The LLVM IR, in string format, of the unoptimized module generated for this
   // GpuExecutable. We save a string instead of an llvm::Module* because leaving
   // llvm::Module* in a singleton can cause the heap checker to emit false
@@ -126,6 +137,14 @@ class GpuExecutable : public Executable {
   // memory for every output/temp buffers.
   const std::unique_ptr<const BufferAssignment> assignment_;
 
+  // Cache of module handles and constant buffer allocation maps used by
+  // `ResolveConstantGlobals`.
+  tensorflow::mutex module_handle_mutex_;
+  std::map<stream_executor::StreamExecutor*, se::ScopedModuleHandle>
+      module_handles_ GUARDED_BY(module_handle_mutex_);
+  std::map<stream_executor::StreamExecutor*, BufferAllocToDeviceMemoryMap>
+      module_globals_ GUARDED_BY(module_handle_mutex_);
+
   TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable);
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
index 09ef62c87f8875a5803497e8eb628769f883202a..cc67804d565af31403ae10d473cdc57f7ca802bc 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
@@ -31,8 +31,8 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-using stream_executor::dnn::DataLayout;
-using stream_executor::dnn::FilterLayout;
+using se::dnn::DataLayout;
+using se::dnn::FilterLayout;
 
 static bool IsVoltaOrLater(const se::StreamExecutor& stream_executor) {
   int major, minor;
@@ -44,7 +44,7 @@ static bool IsVoltaOrLater(const se::StreamExecutor& stream_executor) {
 // Returns (input, filter, output) layouts.
 static std::tuple<DataLayout, FilterLayout, DataLayout>
 HeuristicLayoutAssignment(const HloInstruction* instr,
-                          stream_executor::StreamExecutor* stream_executor) {
+                          se::StreamExecutor* stream_executor) {
   // DataLayout and FilterLayout uses weird enum names. Translations:
   //   N <=> Batch or Output
   //   C <=> Depth or Input
@@ -52,31 +52,44 @@ HeuristicLayoutAssignment(const HloInstruction* instr,
   //   W <=> X
   //
   // Therefore kOutputInputYX and kBatchDepthYX mean NCHW.
+  //
+  // If you have trouble keeping these straight, consider that all that matters
+  // is the location of the channel dim: Is it major (NCHW), or minor (NHWC)?
+
+  constexpr auto kAllNCHW =
+      std::make_tuple(DataLayout::kBatchDepthYX, FilterLayout::kOutputInputYX,
+                      DataLayout::kBatchDepthYX);
+  constexpr auto kAllNHWC =
+      std::make_tuple(DataLayout::kBatchYXDepth, FilterLayout::kOutputYXInput,
+                      DataLayout::kBatchYXDepth);
 
-  // As of today, our empirical evidence is that cudnn 7.0 is faster on V100 x
-  // fp16 with the mostly-NHWC layout. The heuristic may change as cudnn version
-  // changes, as well as the hardware updates.
+  // If we're not Volta or not fp16, the decision is easy: Use NCHW.
   if (!(instr->operand(0)->shape().element_type() == xla::PrimitiveType::F16 &&
         IsVoltaOrLater(*stream_executor))) {
-    return std::make_tuple(DataLayout::kBatchDepthYX,
-                           FilterLayout::kOutputInputYX,
-                           DataLayout::kBatchDepthYX);
+    return kAllNCHW;
   }
+
   VLOG(2) << "Using heuristic to figure out layouts for " << instr->ToString();
-  // For BackwardInput that has stride, full NHWC layouts run significantly
-  // slower than (NHWC, NCHW, NCHW) or (NHWC, NCHW, NHWC).
+
+  // Empirically we've found with Volta and cudnn 7 that backward-input convs
+  // with stride are significantly faster with NCHW layouts.
   //
-  // TODO(timshen): more closely compare (NHWC, NCHW, NCHW) and (NHWC, NCHW,
-  // NHWC).
+  // We could have used a mixed layout combination, e.g. (NHWC, NCHW, NCHW),
+  // which on paper gives good performance. However, there are two observations:
+  // * a mixed layout combination is more cuDNN-bug prone, based on empirical
+  //   envidence.
+  // * we've also observed that for mixed layouts, cuDNN transposes data back
+  //   and forth from a different layout combination. If we end up with
+  //   transposes anyway, we prefer to have them in XLA, as they can be fused.
+  // TODO(timshen): Figure out the exact condition. This may be achieved by
+  // auto-tuning layouts offline.
   if (instr->custom_call_target() == kCudnnConvBackwardInputCallTarget &&
       window_util::HasStride(instr->window())) {
-    return std::make_tuple(DataLayout::kBatchYXDepth,
-                           FilterLayout::kOutputInputYX,
-                           DataLayout::kBatchDepthYX);
+    return kAllNCHW;
   }
-  return std::make_tuple(DataLayout::kBatchYXDepth,
-                         FilterLayout::kOutputYXInput,
-                         DataLayout::kBatchYXDepth);
+
+  // For other Volta f16 convolutions, use NHWC.
+  return kAllNHWC;
 }
 
 // Adds layout constraints on the cudnn custom-call instruction. The layout
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc
index 19420e590d05892417da4d5e62fdcde5eba9d9f1..17226769302eef0dd01550b0bc5404e889ad78f8 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/pool.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/util/ptr_util.h"
@@ -37,10 +37,9 @@ void InitAndStartTimer(std::stack<std::unique_ptr<se::Timer>>* timers,
   stream->InitTimer(timers->top().get()).ThenStartTimer(timers->top().get());
 }
 
-uint64 GetCyclesTaken(
-    std::stack<std::unique_ptr<se::Timer>>* timers,
-    const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams,
-    se::Stream* stream, double clock_rate_ghz) {
+uint64 GetCyclesTaken(std::stack<std::unique_ptr<se::Timer>>* timers,
+                      const std::vector<StreamPool::Ptr>& sub_streams,
+                      se::Stream* stream, double clock_rate_ghz) {
   CHECK_GT(timers->size(), 0);
   stream->ThenWaitFor(&sub_streams);
   stream->ThenStopTimer(timers->top().get());
@@ -53,7 +52,7 @@ uint64 GetCyclesTaken(
 
 HloExecutionProfiler::HloExecutionProfiler(
     bool do_profile, HloExecutionProfile* profile, se::Stream* stream,
-    const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams,
+    const std::vector<StreamPool::Ptr>& sub_streams,
     const HloComputation* computation)
     : do_profile_(do_profile),
       profile_(profile),
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h
index 6654850bef3efa46028defbba81e3537fafbf143..80cde75f2bbb555f514fffea58ad92edf92fd0d1 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h
+++ b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h
@@ -24,7 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/pool.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
 namespace xla {
@@ -38,10 +38,10 @@ class ScopedInstructionProfiler;
 class HloExecutionProfiler {
  public:
   // If profiling is enabled, start an execution timer running.
-  explicit HloExecutionProfiler(
-      bool do_profile, HloExecutionProfile* profile, se::Stream* stream,
-      const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams,
-      const HloComputation* computation);
+  explicit HloExecutionProfiler(bool do_profile, HloExecutionProfile* profile,
+                                se::Stream* stream,
+                                const std::vector<StreamPool::Ptr>& sub_streams,
+                                const HloComputation* computation);
 
   // If profiling is enabled, sets the total cycle count on the profile from the
   // execution timer.
@@ -72,7 +72,7 @@ class HloExecutionProfiler {
   double clock_rate_ghz_;
   HloExecutionProfile* profile_;
   se::Stream* stream_;
-  const std::vector<Pool<se::Stream>::SmartPtr>& sub_streams_;
+  const std::vector<StreamPool::Ptr>& sub_streams_;
   const HloComputation* computation_;
   std::stack<std::unique_ptr<se::Timer>> timers_;
   // Contains the HLO instructions for which we are currently measuring the
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
index 6f2a7e1850a55272dbe335427cf82133bfae0b49..8c11cd05419289d82b033c936bb60884f45cb636 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc
@@ -18,8 +18,10 @@ limitations under the License.
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -39,7 +41,7 @@ void HloToIrBindings::EmitBasePointersForHlos(
   // I/O HLOs are bound to the arguments of the current IR function. I.e.,
   //
   // void IrFunction(io_0, io_1, ..., io_{m-1}, temp_buffer_base) {
-  llvm::Function* function = ir_builder_->GetInsertBlock()->getParent();
+  llvm::Function* function = b_->GetInsertBlock()->getParent();
   CHECK_EQ(io_hlos.size() + 1, function->arg_size());
 
   // An HLO can have duplicated operands. This data structure remembers which
@@ -79,8 +81,8 @@ void HloToIrBindings::EmitBasePointersForHlos(
         const int64 offset = slice.offset();
         CHECK_NE(nullptr, temp_buffer_base_);
         // Emit IR for GetTupleElement instruction and bind to emitted value.
-        llvm::Value* base_ptr = ir_builder_->CreateInBoundsGEP(
-            temp_buffer_base_, ir_builder_->getInt64(offset));
+        llvm::Value* base_ptr =
+            b_->CreateInBoundsGEP(temp_buffer_base_, b_->getInt64(offset));
         BindHloToIrValue(*non_io_hlo,
                          EmitGetTupleElement(non_io_hlo, base_ptr));
       }
@@ -108,15 +110,20 @@ void HloToIrBindings::EmitBasePointersForHlos(
           if (slice.allocation()->is_thread_local()) {
             llvm::Type* pointee_type =
                 llvm_ir::ShapeToIrType(non_io_hlo->shape(), module_);
-            BindHloToIrValue(*non_io_hlo,
-                             ir_builder_->CreateAlloca(pointee_type), index);
+            BindHloToIrValue(*non_io_hlo, b_->CreateAlloca(pointee_type),
+                             index);
+          } else if (slice.allocation()->is_constant()) {
+            llvm::Value* global_for_constant =
+                module_->getGlobalVariable(llvm_ir::AsStringRef(
+                    llvm_ir::ConstantBufferAllocationToGlobalName(
+                        *slice.allocation())));
+            BindHloToIrValue(*non_io_hlo, global_for_constant);
           } else {
             const int64 offset = slice.offset();
             CHECK_NE(nullptr, temp_buffer_base_);
             BindHloToIrValue(
                 *non_io_hlo,
-                ir_builder_->CreateInBoundsGEP(temp_buffer_base_,
-                                               ir_builder_->getInt64(offset)),
+                b_->CreateInBoundsGEP(temp_buffer_base_, b_->getInt64(offset)),
                 index);
           }
         });
@@ -129,11 +136,19 @@ llvm::Value* HloToIrBindings::EmitGetTupleElement(const HloInstruction* gte,
   if (gte->operand(0)->opcode() != HloOpcode::kGetTupleElement) {
     return llvm_ir::EmitGetTupleElement(
         gte->shape(), gte->tuple_index(), /*alignment=*/1,
-        GetTypedIrValue(*gte->operand(0), {}, base_ptr), ir_builder_, module_);
+        GetTypedIrValue(*gte->operand(0), {}, base_ptr), b_, module_);
   }
   return llvm_ir::EmitGetTupleElement(
       gte->shape(), gte->tuple_index(), /*alignment=*/1,
-      EmitGetTupleElement(gte->operand(0), base_ptr), ir_builder_, module_);
+      EmitGetTupleElement(gte->operand(0), base_ptr), b_, module_);
+}
+
+// Returns true if `value` has a name that should not be changed.
+static bool HasMeaningfulName(llvm::Value* value) {
+  if (auto* global = llvm::dyn_cast<llvm::GlobalValue>(value)) {
+    return global->getLinkage() != llvm::GlobalValue::PrivateLinkage;
+  }
+  return false;
 }
 
 llvm::Value* HloToIrBindings::GetTypedIrValue(const HloInstruction& hlo,
@@ -148,11 +163,15 @@ llvm::Value* HloToIrBindings::GetTypedIrValue(const HloInstruction& hlo,
     typed_ir_value = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
         llvm::cast<llvm::GlobalVariable>(ir_value), dest_type);
   } else {
-    typed_ir_value =
-        ir_builder_->CreateBitCast(ir_value, pointee_type->getPointerTo());
+    typed_ir_value = b_->CreateBitCast(ir_value, pointee_type->getPointerTo());
+  }
+  if (!HasMeaningfulName(ir_value)) {
+    ir_value->setName(llvm_ir::AsStringRef(llvm_ir::IrName(&hlo, "raw")));
+  }
+  if (!HasMeaningfulName(typed_ir_value)) {
+    typed_ir_value->setName(
+        llvm_ir::AsStringRef(llvm_ir::IrName(&hlo, "typed")));
   }
-  ir_value->setName(llvm_ir::AsStringRef(llvm_ir::IrName(&hlo, "raw")));
-  typed_ir_value->setName(llvm_ir::AsStringRef(llvm_ir::IrName(&hlo, "typed")));
   return typed_ir_value;
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
index a86e6e78c693ac53bb2c70d88b999a4e1273ecad..eee40b0e91fc03013a6978ae3cfe42b87633eed7 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
+++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.h
@@ -36,14 +36,13 @@ class HloToIrBindings {
  public:
   HloToIrBindings(const HloModule& module,
                   const BufferAssignment* buffer_assignment,
-                  llvm::IRBuilder<>* ir_builder, llvm::Module* llvm_module,
+                  llvm::IRBuilder<>* b, llvm::Module* llvm_module,
                   bool is_nested)
       : buffer_assignment_(buffer_assignment),
         is_nested_(is_nested),
-        ir_builder_(ir_builder),
+        b_(b),
         module_(llvm_module),
-        alias_analysis_(module, *buffer_assignment_,
-                        &ir_builder_->getContext()) {}
+        alias_analysis_(module, *buffer_assignment_, &b_->getContext()) {}
 
   void EmitBasePointersForHlos(
       tensorflow::gtl::ArraySlice<const HloInstruction*> io_hlos,
@@ -104,7 +103,7 @@ class HloToIrBindings {
 
   const bool is_nested_;
 
-  llvm::IRBuilder<>* ir_builder_;
+  llvm::IRBuilder<>* b_;
   llvm::Module* module_;
 
   // Stores the underlying llvm::IrArray for each HloInstruction.
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index 64ed3d748febd8281a8e602194b31c937a4a682a..0f2c83aeb2633a007559d8caac78ea2d233539ed 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -73,6 +73,67 @@ bool IsIEEEFloatingPointScalarConstant(const HloInstruction* constant) {
   }
 }
 
+// This function limits the maximum number of operands to a fusion.
+//
+// There's a cap on how many parameters we can pass to a CUDA kernel, but
+// exactly what that limit is is hazy, as it depends on (among other things) how
+// much GPU constant memory is in use for other purposes.
+//
+// Moreover, we don't even know at the point that we're running fusion how many
+// arguments the CUDA kernel for a fusion node will have: It depends on buffer
+// assignment, where we will decide which of the fusion's operands live in XLA's
+// big temp buffer versus in other allocations.
+//
+// As a heuristic, we simply cap the number of fusion operands plus outputs at
+// kMaxOperandsAndOutputsPerFusion.  This puts an upper bound on the number of
+// parameters to the kernel, working around the correctness problem.
+//
+// This limit is also often good for performance.  In a fusion with many
+// operands, each GPU thread likely has to do a lot of work, and so possibly
+// uses a lot of registers, thus limiting occupancy.
+/*static*/ bool GpuInstructionFusion::FusionWouldBeTooLarge(
+    const HloInstruction* a, const HloInstruction* b) {
+  // Compute the number of outputs of the (possibly multi-output) fusion node
+  // we're considering creating.
+  //
+  // This isn't precise; we may be off by one if
+  //  - We're creating a multi-output fusion out of two non-MOFs.  Creating a
+  //    MOF adds a new buffer, namely, the tuple buffer.
+  //  - We're merging two MOFs.  In this case, we should count the tuple buffer
+  //    only once.
+  //  - WLOG there's an edge from `a` to `b` and `b` is the only consumer of
+  //    `a`.  In this case the result of `a` is not part of the output of the
+  //    fusion.
+  //
+  // But because this is a heuristic and our limit
+  // kMaxOperandsAndOutputsPerFusion is a large value (so +/- 1 doesn't make a
+  // big difference), we ignore this small inaccuracy in favor of simplicity.
+  int64 num_output_buffers = ShapeUtil::SubshapeCount(a->shape()) +
+                             ShapeUtil::SubshapeCount(b->shape());
+
+  // The new fusion will have no more operands and outputs than
+  //   producer_operands + consumer_operands - 1 + num_output_buffers
+  // (minus one because we may be fusing a producer->consumer edge between `a`
+  // and `b`).
+  //
+  // This fact may be enough to let us avoid having to compute the true total
+  // number of operands, which can be expensive.
+  if (a->operand_count() + b->operand_count() - 1 + num_output_buffers <=
+      kMaxOperandsAndOutputsPerFusion) {
+    return false;
+  }
+
+  // Compute the precise number of operands to the new fusion.
+  tensorflow::gtl::FlatSet<const HloInstruction*> operands(
+      a->operands().begin(), a->operands().end());
+  operands.insert(b->operands().begin(), b->operands().end());
+  // If there's an edge between `a` and `b`, don't count it: We're fusing that
+  // producer -> consumer relationship.
+  operands.erase(a);
+  operands.erase(b);
+  return operands.size() + num_output_buffers > kMaxOperandsAndOutputsPerFusion;
+}
+
 bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
                                       int64 operand_index) {
   HloInstruction* producer = consumer->mutable_operand(operand_index);
@@ -141,6 +202,7 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
              IsIEEEFloatingPointScalarConstant(producer->operand(0)) &&
              fused_parameter_users[0]->opcode() == HloOpcode::kMultiply;
     }
+    return false;
   }
 
   // Other output fusions are not currently supported on GPUs.
@@ -183,8 +245,13 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
     return true;
   }
 
-  return IsFusile(*producer) && IsFusile(*consumer) &&
-         InstructionFusion::ShouldFuse(consumer, operand_index);
+  if (!IsFusile(*producer) || !IsFusile(*consumer) ||
+      !InstructionFusion::ShouldFuse(consumer, operand_index)) {
+    return false;
+  }
+
+  // We put this check last because it's potentially expensive.
+  return !FusionWouldBeTooLarge(consumer, producer);
 }
 
 bool GpuInstructionFusion::ShouldFuseIntoMultiOutput(HloInstruction* consumer,
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
index f629d9ff2c7165b652369612c30979150f93bd24..c91f6343a69268ca687004dbe0ffbb863271a95c 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h
@@ -27,6 +27,19 @@ class GpuInstructionFusion : public InstructionFusion {
   explicit GpuInstructionFusion(bool may_duplicate)
       : InstructionFusion(GpuInstructionFusion::IsExpensive, may_duplicate) {}
 
+  // Maximum number of operands plus outputs allowed on a single fusion node.
+  // Exposed publicly mainly for tests.
+  static constexpr int64 kMaxOperandsAndOutputsPerFusion = 64;
+
+  // Determines whether the combination of `a` and `b` into a (possibly
+  // multi-output) fusion would be "too large" -- i.e., have more operands and
+  // outputs than is allowed.
+  //
+  // `ShouldFuse` and `ShouldFuseIntoMultiOutput` call this; it's public so that
+  // other fusion passes (e.g. GPU multi-output fusion) can also call this.
+  static bool FusionWouldBeTooLarge(const HloInstruction* a,
+                                    const HloInstruction* b);
+
   static bool IsExpensive(const HloInstruction& instruction);
 
   bool ShouldFuse(HloInstruction* consumer, int64 operand_index) override;
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index 98ba162cd97b8e214d7f055ee9dd590d7c67e1dd..8d0522bd8fd6659e64d18c52807df8dc7fc2f3b8 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -606,5 +606,35 @@ TEST_F(InstructionFusionTest, FuseScalarConstant) {
                       op::Parameter()));
 }
 
+// Check that we limit the number of operands to fusions we create.
+TEST_F(InstructionFusionTest, AvoidsLargeFusion) {
+  constexpr int64 kNumParams = 200;
+  ASSERT_GT(kNumParams, GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion);
+
+  // Compute p0 + p1 + ... + pN.
+  HloComputation::Builder b(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {10, 100});
+  auto param0 =
+      b.AddInstruction(HloInstruction::CreateParameter(0, shape, "p"));
+  auto sum = param0;
+  for (int64 i = 1; i < kNumParams; ++i) {
+    auto param =
+        b.AddInstruction(HloInstruction::CreateParameter(i, shape, "p"));
+    sum = b.AddInstruction(
+        HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sum, param));
+  }
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(b.Build());
+  EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
+                  .Run(module.get())
+                  .ValueOrDie());
+  SCOPED_TRACE(module->ToString());
+  for (const HloInstruction* instr : computation->instructions()) {
+    EXPECT_LE(instr->operand_count(),
+              GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion)
+        << instr->ToString();
+  }
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 2799baab41ce15e959be16102b534aac0c8f345a..6352b330d17d77da65ed4ffb5a225535ff6caf82 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -81,11 +81,6 @@ bool DotImplementedAsGemm(const HloInstruction& dot) {
 }  // namespace
 
 bool ImplementedAsGemm(const HloInstruction& hlo) {
-  // We can only do this if the HLO is unnested.
-  if (hlo.parent() != hlo.GetModule()->entry_computation()) {
-    return false;
-  }
-
   // For certain types of Dot, we can call pre-canned BLAS gemm.
   if (hlo.opcode() == HloOpcode::kDot) {
     return DotImplementedAsGemm(hlo);
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
index 9bb4c42b15016a763889a903593cb987afda8da9..5d23a3d01842c7b4ff405171cd49c96a19f7e5b0 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
@@ -31,6 +31,12 @@ namespace gpu {
 constexpr int64 kWarpSize = 32;
 
 // Returns true if `hlo` will be implemented as a call to BLAS gemm.
+//
+// Precondition: `hlo` is in an "unnested context", meaning, it lives within the
+// entry computation, within the either of a while loop's subcomputations,
+// within any of a conditional's subcomputations, etc., but *does not* live
+// within a reduce subcomputation, a map subcomputation, a fusion
+// subcomputation, etc.  It's OK if `hlo` *is* a fusion.
 bool ImplementedAsGemm(const HloInstruction& hlo);
 
 // A call to cuDNN for batch normalization is represented as CustomCall HLO with
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index a08b72e3afb039354554c8d1f25e71d42c8a854e..1295e83c0c4c16a1a18eaaadbafb5fd226be6eff 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -44,7 +44,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
-#include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace xla {
@@ -58,12 +57,12 @@ IrEmitter::IrEmitter(const HloModuleConfig& hlo_module_config,
                      IrEmitterContext* ir_emitter_context, bool is_nested)
     : ir_emitter_context_(ir_emitter_context),
       module_(ir_emitter_context->llvm_module()),
-      ir_builder_(module_->getContext()),
+      b_(module_->getContext()),
       bindings_(ir_emitter_context->hlo_module(),
-                &ir_emitter_context->buffer_assignment(), &ir_builder_, module_,
+                &ir_emitter_context->buffer_assignment(), &b_, module_,
                 is_nested),
       hlo_module_config_(hlo_module_config) {
-  ir_builder_.setFastMathFlags(llvm_ir::GetFastMathFlags(
+  b_.setFastMathFlags(llvm_ir::GetFastMathFlags(
       /*fast_math_enabled=*/hlo_module_config.debug_options()
           .xla_enable_fast_math()));
 }
@@ -72,30 +71,16 @@ Status IrEmitter::DefaultAction(HloInstruction* hlo) {
   ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator;
   for (const HloInstruction* operand : hlo->operands()) {
     operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) {
-      return GetIrArray(*operand, *hlo)
-          .EmitReadArrayElement(index, &ir_builder_);
+      return GetIrArray(*operand, *hlo).EmitReadArrayElement(index, &b_);
     };
   }
   return EmitTargetElementLoop(
-      *hlo, GpuElementalIrEmitter(hlo_module_config_, module_, &ir_builder_,
+      *hlo, GpuElementalIrEmitter(hlo_module_config_, module_, &b_,
                                   GetNestedComputer())
                 .MakeElementGenerator(hlo, operand_to_generator));
 }
 
 Status IrEmitter::HandleConstant(HloInstruction* constant) {
-  const Literal& literal = constant->literal();
-  llvm::Constant* initializer =
-      llvm_ir::ConvertLiteralToIrConstant(literal, module_);
-  llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable(
-      *module_, initializer->getType(),
-      /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, initializer,
-      /*Name=*/"");
-  VLOG(2) << "HandleConstant: " << constant->ToString() << std::endl
-          << "  emitted_value: " << llvm_ir::DumpToString(*global_for_const)
-          << std::endl
-          << "  its type: "
-          << llvm_ir::DumpToString(*global_for_const->getType());
-  bindings_.BindHloToIrValue(*constant, global_for_const);
   return Status::OK();
 }
 
@@ -120,139 +105,7 @@ Status IrEmitter::HandleGetTupleElement(HloInstruction* get_tuple_element) {
           get_tuple_element->shape(), get_tuple_element->tuple_index(),
           // TODO(b/26344050): tighten the alignment here
           // based on the real element type.
-          /*alignment=*/1, GetBasePointer(*operand), &ir_builder_, module_));
-  return Status::OK();
-}
-
-Status IrEmitter::HandleSort(HloInstruction* sort) {
-  auto keys = sort->operand(0);
-  auto values = sort->operand_count() > 1 ? sort->operand(1) : nullptr;
-  if (values != nullptr) {
-    // TODO(b/26783907): Also sort the values by their corresponding key.
-    return Unimplemented("Key/Value Sort is not implemented on GPU");
-  }
-  int dimension_to_sort = sort->dimensions(0);
-  const llvm_ir::IrArray& keys_array = GetIrArray(*keys, *sort);
-  const llvm_ir::IrArray& target_array = GetIrArray(*sort, *sort);
-
-  const Shape& keys_shape = keys->shape();
-
-  // TODO(b/26783907): This case can probably be avoided with the Algebraic
-  // Simplifier.
-  if (ShapeUtil::IsScalar(keys_shape)) {
-    return Status::OK();
-  }
-
-  // Create loop nests which loop through the operand dimensions. The sort
-  // dimension is handled in three separate innermost loops which perform the
-  // sorting.
-  llvm_ir::ForLoopNest loop_nest(IrName(sort), &ir_builder_);
-  llvm_ir::IrArray::Index keys_index = EmitOperandArrayLoopNest(
-      keys_array, dimension_to_sort, "keys", &loop_nest);
-
-  // 'compare_keys_index' is the index of the element that 'keys_index' should
-  // be compared to.
-  llvm_ir::IrArray::Index compare_keys_index(keys_index.GetType());
-  for (size_t dimension = 0; dimension < keys_index.size(); ++dimension) {
-    if (dimension != dimension_to_sort) {
-      compare_keys_index.push_back(keys_index[dimension]);
-    } else {
-      compare_keys_index.push_back(nullptr);
-    }
-  }
-
-  // Create the sorting loops which do the sorting.
-  int64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort);
-  std::unique_ptr<llvm_ir::ForLoop> stages_loop = loop_nest.AddLoop(
-      /*start_index=*/0,
-      /*end_index=*/
-      tensorflow::Log2Ceiling64(dimension_to_sort_bound),
-      /*suffix=*/"sort_stages");
-  std::unique_ptr<llvm_ir::ForLoop> mask_loop = loop_nest.AddLoop(
-      /*suffix=*/"mask",
-      /*start_index=*/keys_index.GetConstantWithIndexType(0),
-      /*end_index=*/stages_loop->GetIndVarValue());
-  std::unique_ptr<llvm_ir::ForLoop> compare_loop = loop_nest.AddLoop(
-      /*start_index=*/0,
-      /*end_index=*/dimension_to_sort_bound,
-      /*suffix=*/"compare");
-
-  // Naive C++ code for the inner loops (without parallelization):
-  //
-  // for (int64 stage = 0; stage < Log2Ceiling(dimension_to_sort_bound);
-  //     ++stage) {
-  //   int64 first_xor_mask = (1LL << (stage + 1)) - 1;
-  //   for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
-  //     int64 j = i ^ first_xor_mask;
-  //     if (i < j && j < dimension_to_sort_bound) {
-  //       int64 min_key = std::min(keys[i], keys[j]);
-  //       keys[j] = std::max(keys[i], keys[j]);
-  //       keys[i] = min_key;
-  //     }
-  //   }
-  //   for (int64 mask = 0; mask < stage; ++mask) {
-  //     int64 later_xor_mask = (1LL << (stage - (mask + 1));
-  //     for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
-  //       int64 j = i ^ later_xor_mask;
-  //       if (i < j && j < dimension_to_sort_bound) {
-  //         int64 min_key = std::min(keys[i], keys[j]);
-  //         keys[j] = std::max(keys[i], keys[j]);
-  //         keys[i] = min_key;
-  //       }
-  //     }
-  //   }
-  // }
-  //
-  // This follows the algorithm described on Wikipedia:
-  // https://en.wikipedia.org/wiki/Bitonic_sorter
-
-  SetToFirstInsertPoint(stages_loop->GetBodyBasicBlock(), &ir_builder_);
-  // The first xor mask of a stage is 2^(stage + 1) - 1.
-  auto first_xor_mask = ir_builder_.CreateSub(
-      ir_builder_.CreateShl(
-          keys_index.GetConstantWithIndexType(1),
-          ir_builder_.CreateAdd(stages_loop->GetIndVarValue(),
-                                keys_index.GetConstantWithIndexType(1))),
-      keys_index.GetConstantWithIndexType(1));
-  std::unique_ptr<llvm_ir::ForLoop> first_compare_loop =
-      llvm_ir::ForLoop::EmitForLoop(
-          /*prefix=*/"first_compare",
-          /*start_index=*/keys_index.GetConstantWithIndexType(0),
-          /*end_index=*/
-          keys_index.GetConstantWithIndexType(
-              keys_shape.dimensions(dimension_to_sort)),
-          /*step=*/keys_index.GetConstantWithIndexType(1),
-          /*ir_builder=*/&ir_builder_);
-
-  SetToFirstInsertPoint(first_compare_loop->GetBodyBasicBlock(), &ir_builder_);
-  // 'first_compare_loop' iterates through the 'dimension_to_sort'.
-  keys_index[dimension_to_sort] = first_compare_loop->GetIndVarValue();
-  compare_keys_index[dimension_to_sort] = ir_builder_.CreateXor(
-      first_compare_loop->GetIndVarValue(), first_xor_mask);
-  EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index,
-                  target_array);
-
-  SetToFirstInsertPoint(compare_loop->GetPreheaderBasicBlock(), &ir_builder_);
-  // The later masks of a stage are 2^(stage - (mask_loop_ind_var + 1)).
-  auto later_xor_mask = ir_builder_.CreateShl(
-      keys_index.GetConstantWithIndexType(1),
-      ir_builder_.CreateSub(
-          stages_loop->GetIndVarValue(),
-          ir_builder_.CreateAdd(mask_loop->GetIndVarValue(),
-                                keys_index.GetConstantWithIndexType(1))));
-
-  SetToFirstInsertPoint(compare_loop->GetBodyBasicBlock(), &ir_builder_);
-  // 'compare_loop' iterates through the 'dimension_to_sort'.
-  keys_index[dimension_to_sort] = compare_loop->GetIndVarValue();
-  compare_keys_index[dimension_to_sort] =
-      ir_builder_.CreateXor(compare_loop->GetIndVarValue(), later_xor_mask);
-  EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index,
-                  target_array);
-
-  // Set the IR builder insert point to the exit basic block of the outer most
-  // loop. This ensures later instructions are inserted after this loop nest.
-  ir_builder_.SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
-
+          /*alignment=*/1, GetBasePointer(*operand), &b_, module_));
   return Status::OK();
 }
 
@@ -277,8 +130,7 @@ Status IrEmitter::HandleTuple(HloInstruction* tuple) {
   for (const HloInstruction* operand : tuple->operands()) {
     base_ptrs.push_back(GetBasePointer(*operand));
   }
-  llvm_ir::EmitTuple(GetIrArray(*tuple, *tuple), base_ptrs, &ir_builder_,
-                     module_);
+  llvm_ir::EmitTuple(GetIrArray(*tuple, *tuple), base_ptrs, &b_, module_);
   return Status::OK();
 }
 
@@ -299,7 +151,7 @@ Status IrEmitter::EmitCallToNestedComputation(
   std::vector<llvm::Value*> arguments(operands.begin(), operands.end());
   arguments.push_back(output);
   arguments.push_back(bindings_.GetTempBufferBase());
-  ir_builder_.CreateCall(emitted_function, arguments);
+  b_.CreateCall(emitted_function, arguments);
 
   return Status::OK();
 }
@@ -321,21 +173,20 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation(
       computation.root_instruction()->shape().element_type();
   bool is_atomic_integral = element_type == S32 || element_type == U32 ||
                             element_type == S64 || element_type == U64;
-  llvm::Value* source = ir_builder_.CreateLoad(source_address, "source");
+  llvm::Value* source = b_.CreateLoad(source_address, "source");
   if (root_opcode == HloOpcode::kAdd) {
     // NVPTX supports atomicAdd on F32 and integer types.
     if (element_type == F32) {
       // F32 + F32
       llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_atomic_load_add_f32,
                                    {output_address, source},
-                                   {output_address->getType()}, &ir_builder_);
+                                   {output_address->getType()}, &b_);
       return true;
     }
     if (is_atomic_integral) {
       // integral + integral
-      ir_builder_.CreateAtomicRMW(llvm::AtomicRMWInst::Add, output_address,
-                                  source,
-                                  llvm::AtomicOrdering::SequentiallyConsistent);
+      b_.CreateAtomicRMW(llvm::AtomicRMWInst::Add, output_address, source,
+                         llvm::AtomicOrdering::SequentiallyConsistent);
       return true;
     }
   }
@@ -346,8 +197,8 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation(
     auto opcode = primitive_util::IsSignedIntegralType(element_type)
                       ? llvm::AtomicRMWInst::Max
                       : llvm::AtomicRMWInst::UMax;
-    ir_builder_.CreateAtomicRMW(opcode, output_address, source,
-                                llvm::AtomicOrdering::SequentiallyConsistent);
+    b_.CreateAtomicRMW(opcode, output_address, source,
+                       llvm::AtomicOrdering::SequentiallyConsistent);
     return true;
   }
 
@@ -356,8 +207,8 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation(
     auto opcode = primitive_util::IsSignedIntegralType(element_type)
                       ? llvm::AtomicRMWInst::Min
                       : llvm::AtomicRMWInst::UMin;
-    ir_builder_.CreateAtomicRMW(opcode, output_address, source,
-                                llvm::AtomicOrdering::SequentiallyConsistent);
+    b_.CreateAtomicRMW(opcode, output_address, source,
+                       llvm::AtomicOrdering::SequentiallyConsistent);
     return true;
   }
 
@@ -429,20 +280,20 @@ Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation,
   llvm::Type* element_address_type = element_type->getPointerTo();
 
   int atomic_size = (element_size < 32) ? 32 : element_size;
-  llvm::Type* atomic_type = ir_builder_.getIntNTy(atomic_size);
+  llvm::Type* atomic_type = b_.getIntNTy(atomic_size);
   llvm::Type* atomic_address_type =
       atomic_type->getPointerTo(output_address_type->getPointerAddressSpace());
 
   // cas_old_output_address and cas_new_output_address point to the scratch
   // memory where we store the old and new values for the repeated atomicCAS
   // operations.
-  llvm::Value* cas_old_output_address = ir_builder_.CreateAlloca(
+  llvm::Value* cas_old_output_address = b_.CreateAlloca(
       atomic_type, /*ArraySize=*/nullptr, "cas_old_output_address");
-  llvm::Value* cas_new_output_address = ir_builder_.CreateAlloca(
+  llvm::Value* cas_new_output_address = b_.CreateAlloca(
       atomic_type, /*ArraySize=*/nullptr, "cas_new_output_address");
 
   // Emit preparation code to the preheader.
-  llvm::BasicBlock* loop_preheader_bb = ir_builder_.GetInsertBlock();
+  llvm::BasicBlock* loop_preheader_bb = b_.GetInsertBlock();
 
   llvm::Value* atomic_memory_address;
   // binop_output_address points to the scratch memory that stores the
@@ -453,118 +304,74 @@ Status IrEmitter::EmitAtomicOperationUsingCAS(const HloComputation& computation,
     CHECK_EQ((element_size % sizeof(char)), 0);
     llvm::Type* address_int_type =
         module_->getDataLayout().getIntPtrType(output_address_type);
-    atomic_memory_address =
-        ir_builder_.CreatePtrToInt(output_address, address_int_type);
+    atomic_memory_address = b_.CreatePtrToInt(output_address, address_int_type);
     llvm::Value* mask = llvm::ConstantInt::get(address_int_type, 3);
-    llvm::Value* offset = ir_builder_.CreateAnd(atomic_memory_address, mask);
+    llvm::Value* offset = b_.CreateAnd(atomic_memory_address, mask);
     mask = llvm::ConstantInt::get(address_int_type, -4);
-    atomic_memory_address = ir_builder_.CreateAnd(atomic_memory_address, mask);
+    atomic_memory_address = b_.CreateAnd(atomic_memory_address, mask);
     atomic_memory_address =
-        ir_builder_.CreateIntToPtr(atomic_memory_address, atomic_address_type);
-    binop_output_address = ir_builder_.CreateAdd(
-        ir_builder_.CreatePtrToInt(cas_new_output_address, address_int_type),
-        offset);
+        b_.CreateIntToPtr(atomic_memory_address, atomic_address_type);
+    binop_output_address = b_.CreateAdd(
+        b_.CreatePtrToInt(cas_new_output_address, address_int_type), offset);
     binop_output_address =
-        ir_builder_.CreateIntToPtr(binop_output_address, element_address_type);
+        b_.CreateIntToPtr(binop_output_address, element_address_type);
   } else {
     atomic_memory_address =
-        ir_builder_.CreateBitCast(output_address, atomic_address_type);
+        b_.CreateBitCast(output_address, atomic_address_type);
     binop_output_address =
-        ir_builder_.CreateBitCast(cas_new_output_address, element_address_type);
+        b_.CreateBitCast(cas_new_output_address, element_address_type);
   }
 
   // Use the value from the memory that atomicCAS operates on to initialize
   // cas_old_output.
   llvm::Value* cas_old_output =
-      ir_builder_.CreateLoad(atomic_memory_address, "cas_old_output");
-  ir_builder_.CreateStore(cas_old_output, cas_old_output_address);
+      b_.CreateLoad(atomic_memory_address, "cas_old_output");
+  b_.CreateStore(cas_old_output, cas_old_output_address);
 
   llvm::BasicBlock* loop_exit_bb = loop_preheader_bb->splitBasicBlock(
-      ir_builder_.GetInsertPoint(), "atomic_op_loop_exit");
-  llvm::BasicBlock* loop_body_bb =
-      llvm::BasicBlock::Create(ir_builder_.getContext(), "atomic_op_loop_body",
-                               ir_builder_.GetInsertBlock()->getParent());
-  ir_builder_.SetInsertPoint(loop_body_bb);
+      b_.GetInsertPoint(), "atomic_op_loop_exit");
+  llvm::BasicBlock* loop_body_bb = llvm::BasicBlock::Create(
+      b_.getContext(), "atomic_op_loop_body", b_.GetInsertBlock()->getParent());
+  b_.SetInsertPoint(loop_body_bb);
   // Change preheader's successor from loop_exit_bb to loop_body_bb.
   loop_preheader_bb->getTerminator()->setSuccessor(0, loop_body_bb);
 
   // Emit the body of the loop that repeatedly invokes atomicCAS.
   //
   // Use cas_old_output to initialize cas_new_output.
-  cas_old_output =
-      ir_builder_.CreateLoad(cas_old_output_address, "cas_old_output");
-  ir_builder_.CreateStore(cas_old_output, cas_new_output_address);
+  cas_old_output = b_.CreateLoad(cas_old_output_address, "cas_old_output");
+  b_.CreateStore(cas_old_output, cas_new_output_address);
   // Emits code to calculate new_output = operation(old_output, source);
   TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
       computation, {binop_output_address, source_address},
       binop_output_address));
 
   llvm::Value* cas_new_output =
-      ir_builder_.CreateLoad(cas_new_output_address, "cas_new_output");
+      b_.CreateLoad(cas_new_output_address, "cas_new_output");
 
   // Emit code to perform the atomicCAS operation
   // (cas_old_output, success) = atomicCAS(memory_address, cas_old_output,
   //                                       cas_new_output);
-  llvm::Value* ret_value = ir_builder_.CreateAtomicCmpXchg(
+  llvm::Value* ret_value = b_.CreateAtomicCmpXchg(
       atomic_memory_address, cas_old_output, cas_new_output,
       llvm::AtomicOrdering::SequentiallyConsistent,
       llvm::AtomicOrdering::SequentiallyConsistent);
 
   // Extract the memory value returned from atomicCAS and store it as
   // cas_old_output.
-  ir_builder_.CreateStore(
-      ir_builder_.CreateExtractValue(ret_value, 0, "cas_old_output"),
-      cas_old_output_address);
+  b_.CreateStore(b_.CreateExtractValue(ret_value, 0, "cas_old_output"),
+                 cas_old_output_address);
   // Extract the success bit returned from atomicCAS and generate a
   // conditional branch on the success bit.
-  ir_builder_.CreateCondBr(
-      ir_builder_.CreateExtractValue(ret_value, 1, "success"), loop_exit_bb,
-      loop_body_bb);
+  b_.CreateCondBr(b_.CreateExtractValue(ret_value, 1, "success"), loop_exit_bb,
+                  loop_body_bb);
 
   // Set the insertion point to the exit basic block so that the caller of
   // this method can continue emitting code to the right place.
-  SetToFirstInsertPoint(loop_exit_bb, &ir_builder_);
+  SetToFirstInsertPoint(loop_exit_bb, &b_);
   return Status::OK();
 }
 
-void IrEmitter::EmitCompareLoop(
-    int64 dimension_to_sort, const llvm_ir::IrArray::Index& keys_index,
-    const llvm_ir::IrArray::Index& compare_keys_index,
-    const llvm_ir::IrArray& keys_array) {
-  // TODO(b/26783907): parallelize this loop.
-
-  // if (is_smaller_index &&
-  //     compare_keys[dimension_to_sort] < dimension_to_sort_bound)
-  llvm::Value* is_smaller_index = ir_builder_.CreateICmpSLT(
-      keys_index[dimension_to_sort], compare_keys_index[dimension_to_sort]);
-  int64 dimension_to_sort_bound =
-      keys_array.GetShape().dimensions(dimension_to_sort);
-  auto if_data = llvm_ir::EmitIfThenElse(
-      ir_builder_.CreateAnd(
-          is_smaller_index,
-          ir_builder_.CreateICmpSLT(
-              compare_keys_index[dimension_to_sort],
-              keys_index.GetConstantWithIndexType(dimension_to_sort_bound))),
-      "smaller_comparison_index", &ir_builder_, /*emit_else=*/false);
-  SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
-  auto key1 = keys_array.EmitReadArrayElement(keys_index, &ir_builder_);
-  auto key2 = keys_array.EmitReadArrayElement(compare_keys_index, &ir_builder_);
-  auto key_type = keys_array.GetShape().element_type();
-  auto comparison =
-      primitive_util::IsFloatingPointType(key_type)
-          // TODO(b/26783907): Figure out how to handle NaNs.
-          ? ir_builder_.CreateFCmp(llvm::FCmpInst::FCMP_ULT, key1, key2)
-          : ir_builder_.CreateICmp(
-                primitive_util::IsSignedIntegralType(key_type)
-                    ? llvm::ICmpInst::ICMP_SLT
-                    : llvm::ICmpInst::ICMP_ULT,
-                key1, key2);
-  auto min_key = ir_builder_.CreateSelect(comparison, key1, key2);
-  auto max_key = ir_builder_.CreateSelect(comparison, key2, key1);
-  keys_array.EmitWriteArrayElement(keys_index, min_key, &ir_builder_);
-  keys_array.EmitWriteArrayElement(compare_keys_index, max_key, &ir_builder_);
-}
-
 Status IrEmitter::EmitAtomicOperationForNestedComputation(
     const HloComputation& computation, llvm::Value* output_address,
     llvm::Value* source_address) {
@@ -604,32 +411,32 @@ Status IrEmitter::HandleTupleSelect(HloInstruction* tuple_select) {
   llvm_ir::EmitTupleSelect(GetIrArray(*tuple_select, *tuple_select),
                            GetIrArray(*pred, *tuple_select),
                            GetBasePointer(*on_true), GetBasePointer(*on_false),
-                           &ir_builder_, module_);
+                           &b_, module_);
   return Status::OK();
 }
 
 namespace {
-llvm::Value* Real(llvm::Value* x, llvm::IRBuilder<>* ir_builder) {
-  return ir_builder->CreateExtractValue(x, {0});
-}
-
-llvm::Value* Imag(llvm::Value* x, llvm::IRBuilder<>* ir_builder) {
-  return ir_builder->CreateExtractValue(x, {1});
-}
-
-std::pair<llvm::Value*, llvm::Value*> MultiplyComplex(
-    llvm::Value* lhs_value, llvm::Value* rhs_value,
-    llvm::IRBuilder<>* ir_builder) {
-  llvm::Value* lhs_real = Real(lhs_value, ir_builder);
-  llvm::Value* lhs_imag = Imag(lhs_value, ir_builder);
-  llvm::Value* rhs_real = Real(rhs_value, ir_builder);
-  llvm::Value* rhs_imag = Imag(rhs_value, ir_builder);
-  llvm::Value* real_result1 = ir_builder->CreateFMul(lhs_real, rhs_real);
-  llvm::Value* real_result2 = ir_builder->CreateFMul(lhs_imag, rhs_imag);
-  llvm::Value* real_result = ir_builder->CreateFSub(real_result1, real_result2);
-  llvm::Value* imag_result1 = ir_builder->CreateFMul(lhs_real, rhs_imag);
-  llvm::Value* imag_result2 = ir_builder->CreateFMul(lhs_imag, rhs_real);
-  llvm::Value* imag_result = ir_builder->CreateFAdd(imag_result1, imag_result2);
+llvm::Value* Real(llvm::Value* x, llvm::IRBuilder<>* b) {
+  return b->CreateExtractValue(x, {0});
+}
+
+llvm::Value* Imag(llvm::Value* x, llvm::IRBuilder<>* b) {
+  return b->CreateExtractValue(x, {1});
+}
+
+std::pair<llvm::Value*, llvm::Value*> MultiplyComplex(llvm::Value* lhs_value,
+                                                      llvm::Value* rhs_value,
+                                                      llvm::IRBuilder<>* b) {
+  llvm::Value* lhs_real = Real(lhs_value, b);
+  llvm::Value* lhs_imag = Imag(lhs_value, b);
+  llvm::Value* rhs_real = Real(rhs_value, b);
+  llvm::Value* rhs_imag = Imag(rhs_value, b);
+  llvm::Value* real_result1 = b->CreateFMul(lhs_real, rhs_real);
+  llvm::Value* real_result2 = b->CreateFMul(lhs_imag, rhs_imag);
+  llvm::Value* real_result = b->CreateFSub(real_result1, real_result2);
+  llvm::Value* imag_result1 = b->CreateFMul(lhs_real, rhs_imag);
+  llvm::Value* imag_result2 = b->CreateFMul(lhs_imag, rhs_real);
+  llvm::Value* imag_result = b->CreateFAdd(imag_result1, imag_result2);
   return {real_result, imag_result};
 }
 }  // namespace
@@ -645,25 +452,24 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   const Shape& rhs_shape = rhs_instruction->shape();
 
   // TODO(b/110211620): Convert to use i32 index_type when it is possible.
-  llvm::Type* index_type = ir_builder_.getInt64Ty();
+  llvm::Type* index_type = b_.getInt64Ty();
   llvm_ir::IrArray::Index element_index(index_type);
   if (ShapeUtil::IsScalar(lhs_shape) && ShapeUtil::IsScalar(rhs_shape)) {
     // If the operands are scalar, don't emit any loops.
     llvm::Value* lhs_value =
-        lhs_array.EmitReadArrayElement(/*index=*/element_index, &ir_builder_);
+        lhs_array.EmitReadArrayElement(/*index=*/element_index, &b_);
     llvm::Value* rhs_value =
-        rhs_array.EmitReadArrayElement(/*index=*/element_index, &ir_builder_);
+        rhs_array.EmitReadArrayElement(/*index=*/element_index, &b_);
     llvm::Value* result;
     if (ShapeUtil::ElementIsComplex(lhs_shape)) {
-      auto value = MultiplyComplex(lhs_value, rhs_value, &ir_builder_);
+      auto value = MultiplyComplex(lhs_value, rhs_value, &b_);
       result = llvm::ConstantAggregateZero::get(lhs_array.GetElementLlvmType());
-      result = ir_builder_.CreateInsertValue(result, value.first, {0});
-      result = ir_builder_.CreateInsertValue(result, value.second, {1});
+      result = b_.CreateInsertValue(result, value.first, {0});
+      result = b_.CreateInsertValue(result, value.second, {1});
     } else {
-      result = ir_builder_.CreateFMul(lhs_value, rhs_value);
+      result = b_.CreateFMul(lhs_value, rhs_value);
     }
-    target_array.EmitWriteArrayElement(/*index=*/element_index, result,
-                                       &ir_builder_);
+    target_array.EmitWriteArrayElement(/*index=*/element_index, result, &b_);
     return Status::OK();
   }
 
@@ -690,11 +496,11 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   // Create loop nests which loop through the LHS operand dimensions and the RHS
   // operand dimensions. The reduction dimension of the LHS and RHS are handled
   // in a separate innermost loop which performs the sum of products.
-  llvm_ir::ForLoopNest loop_nest(IrName(dot), &ir_builder_);
-  llvm_ir::IrArray::Index lhs_index = EmitOperandArrayLoopNest(
-      lhs_array, lhs_reduction_dimension, "lhs", &loop_nest);
-  llvm_ir::IrArray::Index rhs_index = EmitOperandArrayLoopNest(
-      rhs_array, rhs_reduction_dimension, "rhs", &loop_nest);
+  llvm_ir::ForLoopNest loop_nest(IrName(dot), &b_);
+  llvm_ir::IrArray::Index lhs_index = loop_nest.EmitOperandArrayLoopNest(
+      lhs_array, /*dimension_to_skip=*/lhs_reduction_dimension, "lhs");
+  llvm_ir::IrArray::Index rhs_index = loop_nest.EmitOperandArrayLoopNest(
+      rhs_array, /*dimension_to_skip=*/rhs_reduction_dimension, "rhs");
 
   // Create the reduction loop which does the sum of products reduction.
   std::unique_ptr<llvm_ir::ForLoop> reduction_loop = loop_nest.AddLoop(
@@ -714,7 +520,7 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   llvm::Value* accum_address = llvm_ir::EmitAllocaAtFunctionEntry(
       accum_type,       // The pointee type of the alloca instruction.
       "accum_address",  // The name of the alloca instruction.
-      &ir_builder_);
+      &b_);
 
   // Initialize the accumulator in the preheader to zero.
   new llvm::StoreInst(
@@ -728,27 +534,25 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
   //   updated_accum = accum + lhs_element * rhs_element
   //   *accum_address = updated_accum
   TF_RET_CHECK(!reduction_loop->GetBodyBasicBlock()->empty());
-  ir_builder_.SetInsertPoint(
+  b_.SetInsertPoint(
       &*reduction_loop->GetBodyBasicBlock()->getFirstInsertionPt());
-  llvm::Value* lhs_element =
-      lhs_array.EmitReadArrayElement(lhs_index, &ir_builder_);
-  llvm::Value* rhs_element =
-      rhs_array.EmitReadArrayElement(rhs_index, &ir_builder_);
-  llvm::Value* accum = ir_builder_.CreateLoad(accum_address);
+  llvm::Value* lhs_element = lhs_array.EmitReadArrayElement(lhs_index, &b_);
+  llvm::Value* rhs_element = rhs_array.EmitReadArrayElement(rhs_index, &b_);
+  llvm::Value* accum = b_.CreateLoad(accum_address);
   llvm::Value* updated_accum;
   if (ShapeUtil::ElementIsComplex(lhs_shape)) {
-    auto value = MultiplyComplex(lhs_element, rhs_element, &ir_builder_);
-    llvm::Value* accum_real = Real(accum, &ir_builder_);
-    llvm::Value* real_sum = ir_builder_.CreateFAdd(accum_real, value.first);
-    updated_accum = ir_builder_.CreateInsertValue(accum, real_sum, {0});
-    llvm::Value* accum_imag = Imag(accum, &ir_builder_);
-    llvm::Value* imag_sum = ir_builder_.CreateFAdd(accum_imag, value.second);
-    updated_accum = ir_builder_.CreateInsertValue(updated_accum, imag_sum, {1});
+    auto value = MultiplyComplex(lhs_element, rhs_element, &b_);
+    llvm::Value* accum_real = Real(accum, &b_);
+    llvm::Value* real_sum = b_.CreateFAdd(accum_real, value.first);
+    updated_accum = b_.CreateInsertValue(accum, real_sum, {0});
+    llvm::Value* accum_imag = Imag(accum, &b_);
+    llvm::Value* imag_sum = b_.CreateFAdd(accum_imag, value.second);
+    updated_accum = b_.CreateInsertValue(updated_accum, imag_sum, {1});
   } else {
-    llvm::Value* product = ir_builder_.CreateFMul(lhs_element, rhs_element);
-    updated_accum = ir_builder_.CreateFAdd(accum, product);
+    llvm::Value* product = b_.CreateFMul(lhs_element, rhs_element);
+    updated_accum = b_.CreateFAdd(accum, product);
   }
-  ir_builder_.CreateStore(updated_accum, accum_address);
+  b_.CreateStore(updated_accum, accum_address);
 
   // After the reduction loop exits, store the accumulator into the target
   // address. The index into the target address is the concatenation of the rhs
@@ -765,16 +569,15 @@ Status IrEmitter::HandleDot(HloInstruction* dot) {
       target_index.push_back(rhs_index[dimension]);
     }
   }
-  SetToFirstInsertPoint(reduction_loop->GetExitBasicBlock(), &ir_builder_);
+  SetToFirstInsertPoint(reduction_loop->GetExitBasicBlock(), &b_);
   target_array.EmitWriteArrayElement(
       target_index,
-      ir_builder_.CreateLoad(
-          accum_address),  // The value written to the target array.
-      &ir_builder_);
+      b_.CreateLoad(accum_address),  // The value written to the target array.
+      &b_);
 
   // Set the IR builder insert point to the exit basic block of the outer most
   // loop. This ensures later instructions are inserted after this loop nest.
-  ir_builder_.SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
+  b_.SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
 
   return Status::OK();
 }
@@ -816,11 +619,10 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) {
       [=](const llvm_ir::IrArray::Index& index) -> StatusOr<llvm::Value*> {
         // Initialize an accumulator with init_value.
         llvm::AllocaInst* accumulator_addr =
-            ir_builder_.CreateAlloca(llvm_ir::PrimitiveTypeToIrType(
+            b_.CreateAlloca(llvm_ir::PrimitiveTypeToIrType(
                 reduce->shape().element_type(), module_));
-        ir_builder_.CreateStore(
-            ir_builder_.CreateLoad(GetBasePointer(*init_value)),
-            accumulator_addr);
+        b_.CreateStore(b_.CreateLoad(GetBasePointer(*init_value)),
+                       accumulator_addr);
 
         // The enclosing loops go over all the target elements. Now we have to
         // compute the actual target element. For this, we build a new loop nest
@@ -828,12 +630,12 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) {
         // AddLoopsForShapeOnDimensions will return an Index where induction
         // Value*s are placed for each dimension in dimensions, and all the rest
         // are nullptrs.
-        llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &ir_builder_);
+        llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &b_);
         const llvm_ir::IrArray::Index reduced_dims_index =
             loops.AddLoopsForShapeOnDimensions(arg->shape(), dimensions,
                                                "reduction_dim");
 
-        SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &ir_builder_);
+        SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_);
 
         // Build a full index for the input argument, using reduced_dims_index
         // as the base. In reduced_dims_index only the reduction dimensions are
@@ -852,13 +654,12 @@ Status IrEmitter::HandleReduce(HloInstruction* reduce) {
 
         // Apply the reduction function to the loaded value.
         llvm::Value* input_address =
-            GetIrArray(*arg, *reduce)
-                .EmitArrayElementAddress(input_index, &ir_builder_);
+            GetIrArray(*arg, *reduce).EmitArrayElementAddress(input_index, &b_);
         TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
             *function, {accumulator_addr, input_address}, accumulator_addr));
 
-        SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &ir_builder_);
-        return ir_builder_.CreateLoad(accumulator_addr);
+        SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_);
+        return b_.CreateLoad(accumulator_addr);
       });
 }
 
@@ -871,8 +672,8 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) {
   for (HloInstruction* operand : fusion->operands()) {
     parameter_arrays.push_back(GetIrArray(*operand, *fusion));
   }
-  GpuElementalIrEmitter elemental_emitter(hlo_module_config_, module_,
-                                          &ir_builder_, GetNestedComputer());
+  GpuElementalIrEmitter elemental_emitter(hlo_module_config_, module_, &b_,
+                                          GetNestedComputer());
   FusedIrEmitter fused_emitter(parameter_arrays, &elemental_emitter);
   TF_RETURN_IF_ERROR(fusion->fused_expression_root()->Accept(&fused_emitter));
 
@@ -902,24 +703,6 @@ Status IrEmitter::HandleOutfeed(HloInstruction*) {
   return Unimplemented("Outfeed is not supported on GPU.");
 }
 
-Status IrEmitter::HandleRng(HloInstruction* random) {
-  ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator;
-  for (const HloInstruction* operand : random->operands()) {
-    operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) {
-      return GetIrArray(*operand, *random)
-          .EmitReadArrayElement(index, &ir_builder_);
-    };
-  }
-  // Emits a single-threaded loop because the loop body generated by the element
-  // generator for Rng can't be parallelized (b/32333178).
-  return llvm_ir::LoopEmitter(
-             GpuElementalIrEmitter(hlo_module_config_, module_, &ir_builder_,
-                                   GetNestedComputer())
-                 .MakeElementGenerator(random, operand_to_generator),
-             GetIrArray(*random, *random), &ir_builder_)
-      .EmitLoop(IrName(random));
-}
-
 Status IrEmitter::HandleBatchNormInference(HloInstruction*) {
   return Unimplemented(
       "The GPU backend does not implement BatchNormInference directly.  It "
@@ -943,34 +726,9 @@ Status IrEmitter::HandleBatchNormGrad(HloInstruction*) {
       "to a cudnn CustomCall using CudnnBatchNormRewriter.");
 }
 
-llvm_ir::IrArray::Index IrEmitter::EmitOperandArrayLoopNest(
-    const llvm_ir::IrArray& operand_array, int64 reduction_dimension,
-    tensorflow::StringPiece name_suffix, llvm_ir::ForLoopNest* loop_nest) {
-  // Prepares the dimension list we will use to emit the loop nest. Outermost
-  // loops are added first. Add loops in major-to-minor order, and skip the
-  // reduction dimension.
-  std::vector<int64> dimensions;
-  const Shape& shape = operand_array.GetShape();
-  for (int i = 0; i < LayoutUtil::MinorToMajor(shape).size(); ++i) {
-    int64 dimension = LayoutUtil::Major(shape.layout(), i);
-    if (dimension != reduction_dimension) {
-      dimensions.push_back(dimension);
-    }
-  }
-
-  // Create loop nest with one for-loop for each dimension of the
-  // output.
-  llvm_ir::IrArray::Index index =
-      loop_nest->AddLoopsForShapeOnDimensions(shape, dimensions, name_suffix);
-  // Verify every dimension except the reduction dimension was set in the index.
-  for (size_t dimension = 0; dimension < index.size(); ++dimension) {
-    if (dimension == reduction_dimension) {
-      DCHECK_EQ(nullptr, index[dimension]);
-    } else {
-      DCHECK_NE(nullptr, index[dimension]);
-    }
-  }
-  return index;
+Status IrEmitter::HandleIota(HloInstruction*) {
+  // TODO(b/64798317): implement iota on GPU.
+  return Unimplemented("Iota is not implemented on GPU.");
 }
 
 StatusOr<llvm::Value*> IrEmitter::ComputeNestedElement(
@@ -979,16 +737,16 @@ StatusOr<llvm::Value*> IrEmitter::ComputeNestedElement(
   llvm::Value* return_buffer = llvm_ir::EmitAllocaAtFunctionEntry(
       llvm_ir::PrimitiveTypeToIrType(
           computation.root_instruction()->shape().element_type(), module_),
-      "return_buffer", &ir_builder_);
+      "return_buffer", &b_);
   std::vector<llvm::Value*> parameter_buffers;
   for (llvm::Value* parameter_element : parameter_elements) {
     parameter_buffers.push_back(llvm_ir::EmitAllocaAtFunctionEntry(
-        parameter_element->getType(), "parameter_buffer", &ir_builder_));
-    ir_builder_.CreateStore(parameter_element, parameter_buffers.back());
+        parameter_element->getType(), "parameter_buffer", &b_));
+    b_.CreateStore(parameter_element, parameter_buffers.back());
   }
   TF_RETURN_IF_ERROR(EmitCallToNestedComputation(computation, parameter_buffers,
                                                  return_buffer));
-  return ir_builder_.CreateLoad(return_buffer);
+  return b_.CreateLoad(return_buffer);
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
index e9ad4a752bbf94904f238309ac93c1bd23c337d4..80e2a203ac3a1fbe95bf38a886288ea8be130148 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
@@ -79,7 +79,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status HandleCrossReplicaSum(HloInstruction* crs) override;
   Status HandleInfeed(HloInstruction* infeed) override;
   Status HandleOutfeed(HloInstruction* outfeed) override;
-  Status HandleSort(HloInstruction* sort) override;
   Status HandleSend(HloInstruction* send) override;
   Status HandleSendDone(HloInstruction* send_done) override;
   Status HandleRecv(HloInstruction* recv) override;
@@ -92,10 +91,10 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status HandleFusion(HloInstruction* fusion) override;
   Status HandleCall(HloInstruction* call) override;
   Status HandleCustomCall(HloInstruction* custom_call) override;
-  Status HandleRng(HloInstruction* random) override;
   Status HandleBatchNormInference(HloInstruction* batch_norm) override;
   Status HandleBatchNormTraining(HloInstruction* batch_norm) override;
   Status HandleBatchNormGrad(HloInstruction* batch_norm) override;
+  Status HandleIota(HloInstruction* iota) override;
 
   Status FinishVisit(HloInstruction* root) override { return Status::OK(); }
 
@@ -162,7 +161,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
 
   // The following fields track the IR emission state. According to LLVM memory
   // management rules, their memory is owned by the module.
-  llvm::IRBuilder<> ir_builder_;
+  llvm::IRBuilder<> b_;
 
   // Mapping from HLO to its underlying LLVM value.
   HloToIrBindings bindings_;
@@ -171,17 +170,6 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   const HloModuleConfig& hlo_module_config_;
 
  private:
-  // Emits a series of nested loops for iterating over an operand array in the
-  // dot operation. Loops are constructed in major to minor dimension layout
-  // order. No loop is emitted for the given reduction_dimension. The function
-  // returns an IrArray index for the given operand_array containing the indvars
-  // of the loops. All dimensions of the index are filled except for the
-  // reduction dimension. name_suffix is the string to append to the names of
-  // LLVM constructs (eg, basic blocks) constructed by this method.
-  llvm_ir::IrArray::Index EmitOperandArrayLoopNest(
-      const llvm_ir::IrArray& operand_array, int64 reduction_dimension,
-      tensorflow::StringPiece name_suffix, llvm_ir::ForLoopNest* loop_nest);
-
   // A helper method for EmitAtomicOperationForNestedComputation. Certain
   // computations, such as floating-point addition and integer maximization, can
   // be simply implemented using an LLVM atomic instruction. If "computation" is
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
index c9574c87a3be208915b3d6a32679553eb425d2f0..5c827e5f9cf3e1c04af444dae338a2ec411ce372 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc
@@ -70,10 +70,10 @@ llvm::Function* IrEmitterNested::EmitBasePointersForNestedComputation(
     argument_dereferenceable_bytes.push_back(root_size);
   }
   // The base pointer of the memory block for all pre-allocated temp buffers.
-  argument_types.push_back(ir_builder_.getInt8PtrTy());
+  argument_types.push_back(b_.getInt8PtrTy());
 
   llvm::FunctionType* function_type =
-      llvm::FunctionType::get(ir_builder_.getVoidTy(), argument_types, false);
+      llvm::FunctionType::get(b_.getVoidTy(), argument_types, false);
   llvm::Function* function = llvm::Function::Create(
       function_type,                       // The function type.
       llvm::GlobalValue::InternalLinkage,  // The linkage type.
@@ -96,8 +96,7 @@ llvm::Function* IrEmitterNested::EmitBasePointersForNestedComputation(
       llvm::BasicBlock::Create(function->getContext(), "entry", function);
   // Emit a "return void" at entry_bb's end, and sets the insert point before
   // that return instruction.
-  ir_builder_.SetInsertPoint(
-      llvm::ReturnInst::Create(function->getContext(), entry_bb));
+  b_.SetInsertPoint(llvm::ReturnInst::Create(function->getContext(), entry_bb));
 
   std::vector<const HloInstruction*> non_io_hlos;
   for (const auto* hlo : nested_computation.instructions()) {
@@ -127,20 +126,17 @@ Status IrEmitterNested::EmitTargetElementLoop(
       target_arrays.push_back(GetIrArray(hlo, hlo, {i}));
     }
     TF_RETURN_IF_ERROR(
-        llvm_ir::LoopEmitter(element_generator, target_arrays, &ir_builder_)
-            .EmitLoop());
+        llvm_ir::LoopEmitter(element_generator, target_arrays, &b_).EmitLoop());
 
     std::vector<llvm::Value*> tuple_operand_ptrs;
     tuple_operand_ptrs.reserve(num_elems);
     for (const llvm_ir::IrArray& array : target_arrays) {
       tuple_operand_ptrs.push_back(array.GetBasePointer());
     }
-    llvm_ir::EmitTuple(GetIrArray(hlo, hlo), tuple_operand_ptrs, &ir_builder_,
-                       module_);
+    llvm_ir::EmitTuple(GetIrArray(hlo, hlo), tuple_operand_ptrs, &b_, module_);
     return Status::OK();
   }
-  return llvm_ir::LoopEmitter(element_generator, GetIrArray(hlo, hlo),
-                              &ir_builder_)
+  return llvm_ir::LoopEmitter(element_generator, GetIrArray(hlo, hlo), &b_)
       .EmitLoop();
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 1caf10a6c165dec27bfe1764fdd9cf0d768efeca..3a5394dac65667ddc5f98e57ca60f5b33cb5e079 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
 #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
+#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
 #include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h"
@@ -59,10 +60,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -71,8 +74,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/bits.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -213,7 +218,7 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype(
   llvm::LLVMContext& context = module->getContext();
   llvm::FunctionType* kernel_type = llvm::FunctionType::get(
       /*Result=*/llvm::Type::getVoidTy(context),
-      std::vector<llvm::Type*>(args.size(), ir_builder_.getInt8PtrTy()),
+      std::vector<llvm::Type*>(args.size(), b_.getInt8PtrTy()),
       /*isVarArg=*/false);
   llvm::Function* kernel =
       llvm::Function::Create(kernel_type, llvm::GlobalValue::ExternalLinkage,
@@ -228,9 +233,20 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype(
     ++arg_it;
 
     kernel->addDereferenceableAttr(arg_no + 1, alloc->size());
+
+    const int64 alignment = [&] {
+      if (alloc->is_entry_computation_parameter()) {
+        return kEntryParameterAlignBytes;
+      } else if (alloc->is_constant()) {
+        return kConstantBufferAlignBytes;
+      } else {
+        return kXlaAllocatedBufferAlignBytes;
+      }
+    }();
+
     kernel->addParamAttr(
-        arg_no, llvm::Attribute::get(context, llvm::Attribute::Alignment,
-                                     kCudaMallocAlignBytes));
+        arg_no,
+        llvm::Attribute::get(context, llvm::Attribute::Alignment, alignment));
 
     if (alloc->IsPreallocatedTempBuffer()) {
       fn_arg->setName("temp_buf");
@@ -249,7 +265,7 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype(
   nvvm_annotations_node->addOperand(llvm::MDNode::get(
       context, {llvm::ConstantAsMetadata::get(kernel),
                 llvm::MDString::get(context, "kernel"),
-                llvm::ConstantAsMetadata::get(ir_builder_.getInt32(1))}));
+                llvm::ConstantAsMetadata::get(b_.getInt32(1))}));
 
   // Update the insert point to the entry basic block.
   llvm::BasicBlock* entry_bb =
@@ -257,7 +273,7 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype(
 
   // Emit a "return void" at entry_bb's end, and set the insert point before
   // that return instruction.
-  ir_builder_.SetInsertPoint(llvm::ReturnInst::Create(context, entry_bb));
+  b_.SetInsertPoint(llvm::ReturnInst::Create(context, entry_bb));
 
   return kernel;
 }
@@ -295,7 +311,7 @@ int ComputeMaxUnrollFactor(const HloInstruction* hlo) {
 //    range of i32.
 // Otherwise, the return type is i64.
 llvm::Type* GetIndexTypeForKernel(const HloInstruction* hlo, int64 launch_size,
-                                  llvm::IRBuilder<>* ir_builder) {
+                                  llvm::IRBuilder<>* b) {
   // Find the unnested hlo instructon for which the kernel is generated for.
   const HloInstruction* unnested_hlo = hlo;
   const HloComputation* computation = hlo->parent();
@@ -316,7 +332,7 @@ llvm::Type* GetIndexTypeForKernel(const HloInstruction* hlo, int64 launch_size,
     return in_range;
   };
 
-  llvm::Type* i64_ty = ir_builder->getInt64Ty();
+  llvm::Type* i64_ty = b->getInt64Ty();
   // Check launch dimension
   if (!IsInt32(launch_size)) {
     return i64_ty;
@@ -345,7 +361,7 @@ llvm::Type* GetIndexTypeForKernel(const HloInstruction* hlo, int64 launch_size,
     }
   }
 
-  return ir_builder->getInt32Ty();
+  return b->getInt32Ty();
 }
 
 }  // namespace
@@ -600,8 +616,8 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
           parameter_arrays.push_back(GetIrArray(*operand, *fusion));
         }
         GpuElementalIrEmitter elemental_emitter(
-            hlo_module_config_, ir_emitter_context_->llvm_module(),
-            &ir_builder_, GetNestedComputer());
+            hlo_module_config_, ir_emitter_context_->llvm_module(), &b_,
+            GetNestedComputer());
         FusedIrEmitter fused_emitter(parameter_arrays, &elemental_emitter);
         TF_RETURN_IF_ERROR(root->Accept(&fused_emitter));
 
@@ -674,7 +690,7 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
     }
     GpuElementalIrEmitter elemental_emitter(hlo_module_config_,
                                             ir_emitter_context_->llvm_module(),
-                                            &ir_builder_, GetNestedComputer());
+                                            &b_, GetNestedComputer());
 
     // Shape of the dynamic-update-slice's "update" operand.
     Shape update_shape = root->operand(1)->shape();
@@ -692,7 +708,7 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
 
     return llvm_ir::EmitParallelFusedDynamicUpdateSliceInPlace(
         fusion, operand_arrays, output_array, &elemental_emitter,
-        launch_dimensions, &ir_builder_);
+        launch_dimensions, &b_);
   }
 
   if (ImplementedAsGemm(*fusion)) {
@@ -740,11 +756,11 @@ Status IrEmitterUnnested::EmitExtraOutputsForReduce(
     const HloInstruction* output = reduce->parent()->FusionInstruction();
     llvm::Value* extra_output_address =
         GetIrArray(*output, *output, extra_output_gens[i].second)
-            .EmitArrayElementAddress(index, &ir_builder_,
+            .EmitArrayElementAddress(index, &b_,
                                      "extra_output_element_address");
     TF_ASSIGN_OR_RETURN(llvm::Value* const extra_output_ir_value,
                         extra_output_gens[i].first(index));
-    ir_builder_.CreateStore(extra_output_ir_value, extra_output_address);
+    b_.CreateStore(extra_output_ir_value, extra_output_address);
   }
   return Status::OK();
 }
@@ -774,8 +790,8 @@ Status IrEmitterUnnested::EmitReductionToScalar(
   LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
       tiled_input_shape, ir_emitter_context_->device_description());
 
-  llvm::Type* index_ty = GetIndexTypeForKernel(
-      reduce, launch_dimensions.launch_bound(), &ir_builder_);
+  llvm::Type* index_ty =
+      GetIndexTypeForKernel(reduce, launch_dimensions.launch_bound(), &b_);
 
   auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
     return llvm::ConstantInt::get(index_ty, c);
@@ -825,52 +841,51 @@ Status IrEmitterUnnested::EmitReductionToScalar(
         llvm_ir::PrimitiveTypeToIrType(input_shape.element_type(), module_);
     std::vector<llvm::Value*> partial_reduction_result_addresses;
     for (int i = 0; i != num_reduces; ++i) {
-      llvm::Value* partial_reduction_result_address = ir_builder_.CreateAlloca(
-          element_ir_type, /*ArraySize=*/nullptr,
-          "partial_reduction_result." + llvm::Twine(i));
+      llvm::Value* partial_reduction_result_address =
+          b_.CreateAlloca(element_ir_type, /*ArraySize=*/nullptr,
+                          "partial_reduction_result." + llvm::Twine(i));
       TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value,
                           init_value_gens[i](IrArray::Index(index_ty)));
-      ir_builder_.CreateStore(init_ir_value, partial_reduction_result_address);
+      b_.CreateStore(init_ir_value, partial_reduction_result_address);
       partial_reduction_result_addresses.push_back(
           partial_reduction_result_address);
     }
 
     llvm::Value* x_in_tiles = tile_index[0];
-    x_in_tiles = ir_builder_.CreateZExtOrTrunc(x_in_tiles, index_ty);
+    x_in_tiles = b_.CreateZExtOrTrunc(x_in_tiles, index_ty);
 
     // Emit an inner for-loop that reduces the elements in the tile.
     auto emit_tile_element_loop = [=](bool tile_in_bounds) -> Status {
       std::unique_ptr<llvm_ir::ForLoop> tile_element_loop =
-          llvm_ir::ForLoop::EmitForLoop("element_id_in_tile",
-                                        index_typed_constant(0),
-                                        index_typed_constant(kTileSize),
-                                        index_typed_constant(1), &ir_builder_);
+          llvm_ir::ForLoop::EmitForLoop(
+              "element_id_in_tile", index_typed_constant(0),
+              index_typed_constant(kTileSize), index_typed_constant(1), &b_);
 
       // Emit the body of the partial reduction loop.
       llvm_ir::SetToFirstInsertPoint(tile_element_loop->GetBodyBasicBlock(),
-                                     &ir_builder_);
-      llvm::Value* x = ir_builder_.CreateNSWAdd(
-          ir_builder_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileSize)),
+                                     &b_);
+      llvm::Value* x = b_.CreateNSWAdd(
+          b_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileSize)),
           tile_element_loop->GetIndVarValue());
       // Unless we know the tile is entirely in bounds, we have to emit a
       // x-in-bounds check before reading from the input.
       if (!tile_in_bounds) {
         llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
-            ir_builder_.CreateICmpULT(x, index_typed_constant(num_elems)),
-            "x_in_bounds", &ir_builder_);
+            b_.CreateICmpULT(x, index_typed_constant(num_elems)), "x_in_bounds",
+            &b_);
 
         // Emit code that reads the input element and accumulates it to
         // the partial reduction result.
-        llvm_ir::SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+        llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_);
       }
 
       IrArray::Index input_index(
-          /*linear=*/x, input_shape, &ir_builder_);
-      llvm::Value* input_address = ir_builder_.CreateAlloca(element_ir_type);
+          /*linear=*/x, input_shape, &b_);
+      llvm::Value* input_address = b_.CreateAlloca(element_ir_type);
       for (int i = 0; i != num_reduces; ++i) {
         TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value,
                             input_gens[i](input_index));
-        ir_builder_.CreateStore(input_ir_value, input_address);
+        b_.CreateStore(input_ir_value, input_address);
         TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
             *reducers[i],
             {partial_reduction_result_addresses[i], input_address},
@@ -881,52 +896,48 @@ Status IrEmitterUnnested::EmitReductionToScalar(
 
     // x_end = kTileSize + x_in_tiles * kTileSize, i.e., the location that's
     // immediately beyond the tile.
-    llvm::Value* x_end = ir_builder_.CreateNSWAdd(
+    llvm::Value* x_end = b_.CreateNSWAdd(
         index_typed_constant(kTileSize),
-        ir_builder_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileSize)));
+        b_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileSize)));
     // The tile is entirely in bound if all_threads_in_bounds or
     // x_end <= num_elems.
-    llvm::Value* tile_in_bounds = ir_builder_.CreateOr(
-        ir_builder_.CreateICmpULE(x_end, index_typed_constant(num_elems)),
-        ir_builder_.getInt1(all_threads_in_bounds));
+    llvm::Value* tile_in_bounds =
+        b_.CreateOr(b_.CreateICmpULE(x_end, index_typed_constant(num_elems)),
+                    b_.getInt1(all_threads_in_bounds));
     llvm_ir::LlvmIfData if_tile_in_bounds_data =
-        llvm_ir::EmitIfThenElse(tile_in_bounds, "tile_in_bounds", &ir_builder_);
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.true_block,
-                                   &ir_builder_);
+        llvm_ir::EmitIfThenElse(tile_in_bounds, "tile_in_bounds", &b_);
+    llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.true_block, &b_);
     TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/true));
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.false_block,
-                                   &ir_builder_);
+    llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.false_block, &b_);
     TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/false));
 
     // After the if-then-else statement on tile_in_bounds, emit calls to
     // shfl_down that accumulate the partial reduction results of all threads
     // from the warp.
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.after_block,
-                                   &ir_builder_);
+    llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.after_block, &b_);
     int bit_width = llvm_ir::GetSizeInBits(element_ir_type);
     // bitcast cannot be applied to aggregate types (even packed ones), so we
     // instead bitcast addresses of load/store to intN* of the same bit-width.
     llvm::Type* shuffle_ir_type = element_ir_type->isStructTy()
-                                      ? ir_builder_.getIntNTy(bit_width)
+                                      ? b_.getIntNTy(bit_width)
                                       : element_ir_type;
     for (int shuffle_distance = kWarpSize / 2; shuffle_distance >= 1;
          shuffle_distance /= 2) {
-      llvm::Value* result_from_other_lane = ir_builder_.CreateAlloca(
-          element_ir_type, nullptr, "result_from_other_lane");
+      llvm::Value* result_from_other_lane =
+          b_.CreateAlloca(element_ir_type, nullptr, "result_from_other_lane");
       for (int i = 0; i != num_reduces; ++i) {
-        llvm::Value* partial_reduction_result = ir_builder_.CreateLoad(
-            ir_builder_.CreateBitCast(partial_reduction_result_addresses[i],
-                                      shuffle_ir_type->getPointerTo()),
+        llvm::Value* partial_reduction_result = b_.CreateLoad(
+            b_.CreateBitCast(partial_reduction_result_addresses[i],
+                             shuffle_ir_type->getPointerTo()),
             "partial_reduction_result");
         CHECK_EQ(launch_dimensions.threads_per_block() % kWarpSize, 0)
             << "Requires block size a multiple of the warp size, otherwise we "
                "will read undefined elements.";
-        ir_builder_.CreateStore(
+        b_.CreateStore(
             EmitFullWarpShuffleDown(partial_reduction_result,
-                                    ir_builder_.getInt32(shuffle_distance),
-                                    &ir_builder_),
-            ir_builder_.CreateBitCast(result_from_other_lane,
-                                      shuffle_ir_type->getPointerTo()));
+                                    b_.getInt32(shuffle_distance), &b_),
+            b_.CreateBitCast(result_from_other_lane,
+                             shuffle_ir_type->getPointerTo()));
         TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
             *reducers[i],
             {partial_reduction_result_addresses[i], result_from_other_lane},
@@ -940,24 +951,23 @@ Status IrEmitterUnnested::EmitReductionToScalar(
     // Emit an atomic operation that accumulates the partial reduction result of
     // lane 0 (which holds the partially accumulated result for its warp) to the
     // output element.
-    llvm::Value* lane_id = ir_builder_.CreateURem(
-        x_in_tiles, index_typed_constant(kWarpSize), "lane_id");
+    llvm::Value* lane_id =
+        b_.CreateURem(x_in_tiles, index_typed_constant(kWarpSize), "lane_id");
     llvm_ir::LlvmIfData if_lane_id_is_zero_data = llvm_ir::EmitIfThenElse(
-        ir_builder_.CreateICmpEQ(lane_id, index_typed_constant(0)),
-        "lane_id_is_zero", &ir_builder_);
-    llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block,
-                                   &ir_builder_);
+        b_.CreateICmpEQ(lane_id, index_typed_constant(0)), "lane_id_is_zero",
+        &b_);
+    llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, &b_);
 
     for (int i = 0; i != num_reduces; ++i) {
       llvm::Value* output_address =
           GetIrArray(*output, *output, reduce_output_shapes[i])
               .EmitArrayElementAddress(
                   IrArray::Index(
-                      /*linear=*/ir_builder_.getInt64(0),
+                      /*linear=*/b_.getInt64(0),
                       ShapeUtil::GetSubshape(output->shape(),
                                              reduce_output_shapes[i]),
-                      &ir_builder_),
-                  &ir_builder_, "output_element_address");
+                      &b_),
+                  &b_, "output_element_address");
       TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation(
           *reducers[i], output_address, partial_reduction_result_addresses[i]));
     }
@@ -971,7 +981,7 @@ Status IrEmitterUnnested::EmitReductionToScalar(
       static_cast<SequentialThunk*>(LastThunk())->thunks().back().get(),
       ir_emitter_context_->llvm_module());
   return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape,
-                             launch_dimensions, &ir_builder_)
+                             launch_dimensions, &b_)
       .EmitLoop(IrName(reduce), index_ty);
 }
 
@@ -1015,7 +1025,7 @@ Status IrEmitterUnnested::EmitColumnReduction(
       tiled_input_shape, ir_emitter_context_->device_description());
 
   // TODO(b/110211620): Convert to use i32 index_type when it is possible.
-  llvm::Type* index_ty = ir_builder_.getInt64Ty();
+  llvm::Type* index_ty = b_.getInt64Ty();
 
   auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
     return llvm::ConstantInt::get(index_ty, c);
@@ -1065,14 +1075,12 @@ Status IrEmitterUnnested::EmitColumnReduction(
     for (int i = 0; i != num_reduces; ++i) {
       for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) {
         llvm::Value* partial_reduction_result_address =
-            ir_builder_.CreateAlloca(
-                element_ir_type, /*ArraySize=*/nullptr,
-                "partial_reduction_result." +
-                    llvm::Twine(i * kTileWidth + x_offset));
+            b_.CreateAlloca(element_ir_type, /*ArraySize=*/nullptr,
+                            "partial_reduction_result." +
+                                llvm::Twine(i * kTileWidth + x_offset));
         TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value,
                             init_value_gens[i](IrArray::Index(index_ty)));
-        ir_builder_.CreateStore(init_ir_value,
-                                partial_reduction_result_address);
+        b_.CreateStore(init_ir_value, partial_reduction_result_address);
         partial_reduction_result_addresses.push_back(
             partial_reduction_result_address);
       }
@@ -1083,50 +1091,47 @@ Status IrEmitterUnnested::EmitColumnReduction(
     llvm::Value* y_in_tiles = tile_index[0];
     llvm::Value* x_in_tiles = tile_index[1];
 
-    y_in_tiles = ir_builder_.CreateZExtOrTrunc(y_in_tiles, index_ty);
-    x_in_tiles = ir_builder_.CreateZExtOrTrunc(x_in_tiles, index_ty);
+    y_in_tiles = b_.CreateZExtOrTrunc(y_in_tiles, index_ty);
+    x_in_tiles = b_.CreateZExtOrTrunc(x_in_tiles, index_ty);
 
     auto emit_tile_element_loop = [=](bool tile_in_y_bounds,
                                       bool tile_in_x_bounds) -> Status {
       std::unique_ptr<llvm_ir::ForLoop> tile_element_loop =
-          llvm_ir::ForLoop::EmitForLoop("element_id_in_tile",
-                                        index_typed_constant(0),
-                                        index_typed_constant(kTileHeight),
-                                        index_typed_constant(1), &ir_builder_);
+          llvm_ir::ForLoop::EmitForLoop(
+              "element_id_in_tile", index_typed_constant(0),
+              index_typed_constant(kTileHeight), index_typed_constant(1), &b_);
 
       // Emit the body of the partial reduction loop.
       llvm_ir::SetToFirstInsertPoint(tile_element_loop->GetBodyBasicBlock(),
-                                     &ir_builder_);
-      llvm::Value* y = ir_builder_.CreateNSWAdd(
-          ir_builder_.CreateNSWMul(y_in_tiles,
-                                   index_typed_constant(kTileHeight)),
+                                     &b_);
+      llvm::Value* y = b_.CreateNSWAdd(
+          b_.CreateNSWMul(y_in_tiles, index_typed_constant(kTileHeight)),
           tile_element_loop->GetIndVarValue());
 
       // Unless we know that y is in bounds, we have to emit a check before
       // reading from the input.
       if (!tile_in_y_bounds) {
         llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
-            ir_builder_.CreateICmpULT(y, index_typed_constant(height)),
-            "y_in_bounds", &ir_builder_);
+            b_.CreateICmpULT(y, index_typed_constant(height)), "y_in_bounds",
+            &b_);
 
         // Emit code that reads the input element and accumulates it to
         // the partial reduction result.
-        llvm_ir::SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+        llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_);
       }
       for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) {
-        llvm::Value* x = ir_builder_.CreateNSWAdd(
-            ir_builder_.CreateNSWMul(x_in_tiles,
-                                     index_typed_constant(kTileWidth)),
+        llvm::Value* x = b_.CreateNSWAdd(
+            b_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileWidth)),
             index_typed_constant(x_offset));
         // Unless we know that x is in bounds, we have to emit a check before
         // reading from the input.
         if (!tile_in_x_bounds) {
           llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(
-              ir_builder_.CreateICmpULT(x, index_typed_constant(width)),
-              "x_in_bounds", &ir_builder_);
-          llvm_ir::SetToFirstInsertPoint(if_data.true_block, &ir_builder_);
+              b_.CreateICmpULT(x, index_typed_constant(width)), "x_in_bounds",
+              &b_);
+          llvm_ir::SetToFirstInsertPoint(if_data.true_block, &b_);
         }
-        llvm::Value* input_address = ir_builder_.CreateAlloca(element_ir_type);
+        llvm::Value* input_address = b_.CreateAlloca(element_ir_type);
         // {y,x} is an index to input_matrix_shape [height,width]. We need to
         // convert that to an index to input_shape (the shape of the operand of
         // "reduce"). This conversion is composed of a transposition from
@@ -1143,18 +1148,17 @@ Status IrEmitterUnnested::EmitColumnReduction(
             ShapeUtil::MakeShapeWithDescendingLayout(input_shape.element_type(),
                                                      {height, width});
         const IrArray::Index input_matrix_index({y, x}, input_matrix_shape,
-                                                &ir_builder_);
+                                                &b_);
         const IrArray::Index input_index =
             input_matrix_index
                 .SourceIndexOfReshape(input_matrix_shape,
-                                      normalized_input_shape, &ir_builder_)
+                                      normalized_input_shape, &b_)
                 .SourceIndexOfTranspose(normalized_input_shape, input_shape,
-                                        transpose_dimension_mapping,
-                                        &ir_builder_);
+                                        transpose_dimension_mapping, &b_);
         for (int i = 0; i != num_reduces; ++i) {
           TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value,
                               input_gens[i](input_index));
-          ir_builder_.CreateStore(input_ir_value, input_address);
+          b_.CreateStore(input_ir_value, input_address);
           TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
               *reducers[i],
               {partial_reduction_result_addresses[i * kTileWidth + x_offset],
@@ -1169,64 +1173,55 @@ Status IrEmitterUnnested::EmitColumnReduction(
 
     // y_end = kTileHeight + y_in_tiles * kTileHeight, i.e., the y location
     // that's immediately beyond the tile.
-    llvm::Value* y_end = ir_builder_.CreateNSWAdd(
+    llvm::Value* y_end = b_.CreateNSWAdd(
         index_typed_constant(kTileHeight),
-        ir_builder_.CreateNSWMul(y_in_tiles,
-                                 index_typed_constant(kTileHeight)));
+        b_.CreateNSWMul(y_in_tiles, index_typed_constant(kTileHeight)));
     // x_end = kTileWidth + x_in_tiles * kTileWidth, i.e., the x location
     // that's immediately beyond the tile.
-    llvm::Value* x_end = ir_builder_.CreateNSWAdd(
+    llvm::Value* x_end = b_.CreateNSWAdd(
         index_typed_constant(kTileWidth),
-        ir_builder_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileWidth)));
-    llvm::Value* tile_in_y_bounds = ir_builder_.CreateOr(
-        ir_builder_.CreateICmpULE(y_end, index_typed_constant(height)),
-        ir_builder_.getInt1(height % kTileHeight == 0));
-    llvm::Value* tile_in_x_bounds = ir_builder_.CreateOr(
-        ir_builder_.CreateICmpULE(x_end, index_typed_constant(width)),
-        ir_builder_.getInt1(width % kTileWidth == 0));
+        b_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileWidth)));
+    llvm::Value* tile_in_y_bounds =
+        b_.CreateOr(b_.CreateICmpULE(y_end, index_typed_constant(height)),
+                    b_.getInt1(height % kTileHeight == 0));
+    llvm::Value* tile_in_x_bounds =
+        b_.CreateOr(b_.CreateICmpULE(x_end, index_typed_constant(width)),
+                    b_.getInt1(width % kTileWidth == 0));
     // The tile is in y bounds if "height" is a multiple of kTileHeight or
     // y_end <= height.
-    llvm_ir::LlvmIfData if_tile_in_y_bounds_data = llvm_ir::EmitIfThenElse(
-        tile_in_y_bounds, "tile_in_y_bounds", &ir_builder_);
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.true_block,
-                                   &ir_builder_);
+    llvm_ir::LlvmIfData if_tile_in_y_bounds_data =
+        llvm_ir::EmitIfThenElse(tile_in_y_bounds, "tile_in_y_bounds", &b_);
+    llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.true_block, &b_);
     // The tile is in x bounds if "width" is a multiple of kTileWidth or
     // x_end <= width.
-    llvm_ir::LlvmIfData if_tile_in_x_bounds_data = llvm_ir::EmitIfThenElse(
-        tile_in_x_bounds, "tile_in_x_bounds", &ir_builder_);
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.true_block,
-                                   &ir_builder_);
+    llvm_ir::LlvmIfData if_tile_in_x_bounds_data =
+        llvm_ir::EmitIfThenElse(tile_in_x_bounds, "tile_in_x_bounds", &b_);
+    llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.true_block, &b_);
     TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/true,
                                               /*tile_in_x_bounds=*/true));
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.false_block,
-                                   &ir_builder_);
+    llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.false_block, &b_);
     TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/true,
                                               /*tile_in_x_bounds=*/false));
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.false_block,
-                                   &ir_builder_);
-    if_tile_in_x_bounds_data = llvm_ir::EmitIfThenElse(
-        tile_in_x_bounds, "tile_in_x_bounds", &ir_builder_);
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.true_block,
-                                   &ir_builder_);
+    llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.false_block, &b_);
+    if_tile_in_x_bounds_data =
+        llvm_ir::EmitIfThenElse(tile_in_x_bounds, "tile_in_x_bounds", &b_);
+    llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.true_block, &b_);
     TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/false,
                                               /*tile_in_x_bounds=*/true));
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.false_block,
-                                   &ir_builder_);
+    llvm_ir::SetToFirstInsertPoint(if_tile_in_x_bounds_data.false_block, &b_);
     TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_y_bounds=*/false,
                                               /*tile_in_x_bounds=*/false));
 
     // After the nested if-then-else statement on tile_in_y_bounds and
     // tile_in_x_bounds, emit atomic operations to accumulate the partial
     // reduction result to the output element.
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.after_block,
-                                   &ir_builder_);
+    llvm_ir::SetToFirstInsertPoint(if_tile_in_y_bounds_data.after_block, &b_);
     const HloInstruction* output =
         reduce->IsFused() ? reduce->parent()->FusionInstruction() : reduce;
     for (int i = 0; i != num_reduces; ++i) {
       for (int x_offset = 0; x_offset < kTileWidth; ++x_offset) {
-        llvm::Value* x = ir_builder_.CreateNSWAdd(
-            ir_builder_.CreateNSWMul(x_in_tiles,
-                                     index_typed_constant(kTileWidth)),
+        llvm::Value* x = b_.CreateNSWAdd(
+            b_.CreateNSWMul(x_in_tiles, index_typed_constant(kTileWidth)),
             index_typed_constant(x_offset));
         llvm::Value* output_address =
             GetIrArray(*output, *output, reduce_output_shapes[i])
@@ -1235,8 +1230,8 @@ Status IrEmitterUnnested::EmitColumnReduction(
                         x,
                         ShapeUtil::GetSubshape(output->shape(),
                                                reduce_output_shapes[i]),
-                        &ir_builder_),
-                    &ir_builder_, "output_element_address");
+                        &b_),
+                    &b_, "output_element_address");
         TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation(
             *reducers[i], output_address,
             partial_reduction_result_addresses[i * kTileWidth + x_offset]));
@@ -1252,7 +1247,7 @@ Status IrEmitterUnnested::EmitColumnReduction(
       static_cast<SequentialThunk*>(LastThunk())->thunks().back().get(),
       ir_emitter_context_->llvm_module());
   return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape,
-                             launch_dimensions, &ir_builder_)
+                             launch_dimensions, &b_)
       .EmitLoop(IrName(reduce), index_ty);
 }
 
@@ -1402,8 +1397,8 @@ Status IrEmitterUnnested::EmitRowReduction(
       {depth / z_tile_size, height, width_in_tiles}, {2, 1, 0});
   LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
       tiled_input_shape, ir_emitter_context_->device_description());
-  llvm::Type* index_ty = GetIndexTypeForKernel(
-      reduce, launch_dimensions.launch_bound(), &ir_builder_);
+  llvm::Type* index_ty =
+      GetIndexTypeForKernel(reduce, launch_dimensions.launch_bound(), &b_);
 
   auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
     return llvm::ConstantInt::get(index_ty, c);
@@ -1415,12 +1410,12 @@ Status IrEmitterUnnested::EmitRowReduction(
         input_shape.element_type(), ir_emitter_context_->llvm_module());
     std::vector<llvm::Value*> partial_reduction_result_addresses;
     for (int i = 0; i != num_reduces; ++i) {
-      llvm::Value* partial_reduction_result_address = ir_builder_.CreateAlloca(
-          element_ir_type, /*ArraySize=*/nullptr,
-          "partial_reduction_result." + llvm::Twine(i));
+      llvm::Value* partial_reduction_result_address =
+          b_.CreateAlloca(element_ir_type, /*ArraySize=*/nullptr,
+                          "partial_reduction_result." + llvm::Twine(i));
       TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value,
                           init_value_gens[i](IrArray::Index(index_ty)));
-      ir_builder_.CreateStore(init_ir_value, partial_reduction_result_address);
+      b_.CreateStore(init_ir_value, partial_reduction_result_address);
       partial_reduction_result_addresses.push_back(
           partial_reduction_result_address);
     }
@@ -1429,25 +1424,25 @@ Status IrEmitterUnnested::EmitRowReduction(
     llvm::Value* y = tile_index[1];
     llvm::Value* x_tile = tile_index[2];
 
-    x_tile = ir_builder_.CreateZExtOrTrunc(x_tile, index_ty);
+    x_tile = b_.CreateZExtOrTrunc(x_tile, index_ty);
 
-    llvm::Value* warp_id = ir_builder_.CreateUDiv(
-        x_tile, index_typed_constant(kWarpSize), "warp_id");
-    llvm::Value* lane_id = ir_builder_.CreateURem(
-        x_tile, index_typed_constant(kWarpSize), "lane_id");
+    llvm::Value* warp_id =
+        b_.CreateUDiv(x_tile, index_typed_constant(kWarpSize), "warp_id");
+    llvm::Value* lane_id =
+        b_.CreateURem(x_tile, index_typed_constant(kWarpSize), "lane_id");
 
     // The x-location of the last element in this z-x-tile.
     // last_x = lane_id + warpSize * (x_tile_size - 1 + warp_id * x_tile_size);
-    llvm::Value* last_x = ir_builder_.CreateNSWAdd(
-        lane_id, ir_builder_.CreateNSWMul(
-                     index_typed_constant(kWarpSize),
-                     ir_builder_.CreateNSWAdd(
-                         index_typed_constant(x_tile_size - 1),
-                         ir_builder_.CreateNSWMul(
-                             warp_id, index_typed_constant(x_tile_size)))));
+    llvm::Value* last_x = b_.CreateNSWAdd(
+        lane_id,
+        b_.CreateNSWMul(
+            index_typed_constant(kWarpSize),
+            b_.CreateNSWAdd(
+                index_typed_constant(x_tile_size - 1),
+                b_.CreateNSWMul(warp_id, index_typed_constant(x_tile_size)))));
 
     KernelSupportLibrary ksl(
-        &ir_builder_,
+        &b_,
         /*unroll_mode=*/xla::llvm_ir::UnrollMode::kFullyUnroll,
         /*prevent_vectorization=*/false);
 
@@ -1456,9 +1451,9 @@ Status IrEmitterUnnested::EmitRowReduction(
     auto emit_z_x_tile_element_loop = [&](bool x_tile_in_bounds,
                                           int64 x_tile_loop_bound) -> Status {
       auto emit_z_tile_element_loop = [&](llvm::Value* z_indvar) -> Status {
-        llvm::Value* z = ir_builder_.CreateNSWAdd(
-            z_indvar, ir_builder_.CreateNSWMul(
-                          index_typed_constant(z_tile_size), z_tile));
+        llvm::Value* z = b_.CreateNSWAdd(
+            z_indvar,
+            b_.CreateNSWMul(index_typed_constant(z_tile_size), z_tile));
         TF_RETURN_IF_ERROR(ksl.For(
             "x_tile",
             /*start=*/index_typed_constant(0),
@@ -1466,12 +1461,12 @@ Status IrEmitterUnnested::EmitRowReduction(
             /*step=*/1, [&](llvm::Value* x_indvar) -> Status {
               // x = lane_id +
               //     warpSize * (element_id_in_x_tile + warp_id * x_tile_size);
-              llvm::Value* x = ir_builder_.CreateNSWAdd(
+              llvm::Value* x = b_.CreateNSWAdd(
                   lane_id,
-                  ir_builder_.CreateNSWMul(
+                  b_.CreateNSWMul(
                       index_typed_constant(kWarpSize),
-                      ir_builder_.CreateNSWAdd(
-                          x_indvar, ir_builder_.CreateNSWMul(
+                      b_.CreateNSWAdd(
+                          x_indvar, b_.CreateNSWMul(
                                         warp_id, llvm::ConstantInt::get(
                                                      index_ty, x_tile_size)))));
 
@@ -1479,18 +1474,17 @@ Status IrEmitterUnnested::EmitRowReduction(
               // emit a x-in-bounds check before reading from the input.
               if (!x_tile_in_bounds) {
                 llvm_ir::LlvmIfData if_x_in_bounds_data =
-                    llvm_ir::EmitIfThenElse(ir_builder_.CreateICmpULT(
-                                                x, index_typed_constant(width)),
-                                            "x_in_bounds", &ir_builder_);
-                // Points ir_builder_ to the then-block.
+                    llvm_ir::EmitIfThenElse(
+                        b_.CreateICmpULT(x, index_typed_constant(width)),
+                        "x_in_bounds", &b_);
+                // Points b_ to the then-block.
                 llvm_ir::SetToFirstInsertPoint(if_x_in_bounds_data.true_block,
-                                               &ir_builder_);
+                                               &b_);
               }
 
               // Emit code that reads the input element and accumulates it
               // to the partial reduction result.
-              llvm::Value* input_address =
-                  ir_builder_.CreateAlloca(element_ir_type);
+              llvm::Value* input_address = b_.CreateAlloca(element_ir_type);
               {
                 // {z,y,x} is an index to input_3d_tensor_shape
                 // [depth,height,width]. We need to convert that to an index
@@ -1509,20 +1503,19 @@ Status IrEmitterUnnested::EmitRowReduction(
                     ShapeUtil::MakeShapeWithDescendingLayout(
                         input_shape.element_type(), {depth, height, width});
                 const IrArray::Index input_3d_tensor_index(
-                    {z, y, x}, input_3d_tensor_shape, &ir_builder_);
+                    {z, y, x}, input_3d_tensor_shape, &b_);
                 const IrArray::Index input_index =
                     input_3d_tensor_index
                         .SourceIndexOfReshape(input_3d_tensor_shape,
-                                              normalized_input_shape,
-                                              &ir_builder_)
+                                              normalized_input_shape, &b_)
                         .SourceIndexOfTranspose(
                             normalized_input_shape, input_shape,
-                            transpose_dimension_mapping, &ir_builder_);
+                            transpose_dimension_mapping, &b_);
 
                 for (int i = 0; i != num_reduces; ++i) {
                   TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value,
                                       input_gens[i](input_index));
-                  ir_builder_.CreateStore(input_ir_value, input_address);
+                  b_.CreateStore(input_ir_value, input_address);
                   TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
                       *reducers[i],
                       {partial_reduction_result_addresses[i], input_address},
@@ -1541,9 +1534,9 @@ Status IrEmitterUnnested::EmitRowReduction(
                      /*step=*/1, emit_z_tile_element_loop);
     };
 
-    llvm::Value* tile_in_bounds = ir_builder_.CreateOr(
-        ir_builder_.getInt1(width % (x_tile_size * kWarpSize) == 0),
-        ir_builder_.CreateICmpULT(last_x, index_typed_constant(width)));
+    llvm::Value* tile_in_bounds =
+        b_.CreateOr(b_.getInt1(width % (x_tile_size * kWarpSize) == 0),
+                    b_.CreateICmpULT(last_x, index_typed_constant(width)));
 
     TF_RETURN_IF_ERROR(
         ksl.If(tile_in_bounds,
@@ -1566,26 +1559,25 @@ Status IrEmitterUnnested::EmitRowReduction(
     // bitcast cannot be applied to aggregate types (even packed ones), so we
     // instead bitcast addresses of load/store to intN* of the same bit-width.
     llvm::Type* shuffle_ir_type = element_ir_type->isStructTy()
-                                      ? ir_builder_.getIntNTy(bit_width)
+                                      ? b_.getIntNTy(bit_width)
                                       : element_ir_type;
     for (int shuffle_distance = 16; shuffle_distance >= 1;
          shuffle_distance /= 2) {
-      llvm::Value* result_from_other_lane = ir_builder_.CreateAlloca(
-          element_ir_type, nullptr, "result_from_other_lane");
+      llvm::Value* result_from_other_lane =
+          b_.CreateAlloca(element_ir_type, nullptr, "result_from_other_lane");
       for (int i = 0; i != num_reduces; ++i) {
-        llvm::Value* partial_reduction_result = ir_builder_.CreateLoad(
-            ir_builder_.CreateBitCast(partial_reduction_result_addresses[i],
-                                      shuffle_ir_type->getPointerTo()),
+        llvm::Value* partial_reduction_result = b_.CreateLoad(
+            b_.CreateBitCast(partial_reduction_result_addresses[i],
+                             shuffle_ir_type->getPointerTo()),
             "partial_reduction_result");
         CHECK_EQ(launch_dimensions.threads_per_block() % kWarpSize, 0)
             << "Requires block size a multiple of the warp size, otherwise we "
                "will read undefined elements.";
-        ir_builder_.CreateStore(
+        b_.CreateStore(
             EmitFullWarpShuffleDown(partial_reduction_result,
-                                    ir_builder_.getInt32(shuffle_distance),
-                                    &ir_builder_),
-            ir_builder_.CreateBitCast(result_from_other_lane,
-                                      shuffle_ir_type->getPointerTo()));
+                                    b_.getInt32(shuffle_distance), &b_),
+            b_.CreateBitCast(result_from_other_lane,
+                             shuffle_ir_type->getPointerTo()));
         TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
             *reducers[i],
             {partial_reduction_result_addresses[i], result_from_other_lane},
@@ -1600,10 +1592,9 @@ Status IrEmitterUnnested::EmitRowReduction(
     // lane 0 (which holds the partially accumulated result for its warp) to the
     // output element.
     llvm_ir::LlvmIfData if_lane_id_is_zero_data = llvm_ir::EmitIfThenElse(
-        ir_builder_.CreateICmpEQ(lane_id, index_typed_constant(0)),
-        "lane_id_is_zero", &ir_builder_);
-    llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block,
-                                   &ir_builder_);
+        b_.CreateICmpEQ(lane_id, index_typed_constant(0)), "lane_id_is_zero",
+        &b_);
+    llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, &b_);
     for (int i = 0; i != num_reduces; ++i) {
       llvm::Value* output_address =
           GetIrArray(*output, *output, reduce_output_shapes[i])
@@ -1611,8 +1602,8 @@ Status IrEmitterUnnested::EmitRowReduction(
                   IrArray::Index(y,
                                  ShapeUtil::GetSubshape(
                                      output->shape(), reduce_output_shapes[i]),
-                                 &ir_builder_),
-                  &ir_builder_, "output_element_address");
+                                 &b_),
+                  &b_, "output_element_address");
       // We don't need to emit atomic operations if there is only one tile of
       // results. 'depth' is the z dimension, 'width' is the x dimension.
       if (z_tile_size >= depth && x_tile_size >= width) {
@@ -1636,7 +1627,7 @@ Status IrEmitterUnnested::EmitRowReduction(
       static_cast<SequentialThunk*>(LastThunk())->thunks().back().get(),
       ir_emitter_context_->llvm_module());
   return ParallelLoopEmitter(loop_body_emitter, tiled_input_shape,
-                             launch_dimensions, &ir_builder_)
+                             launch_dimensions, &b_)
       .EmitLoop(IrName(reduce), index_ty);
 }
 
@@ -1762,12 +1753,11 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
 
     return EmitReductionToVector(
         reduce, input->shape(), {[&](const IrArray::Index& index) {
-          return GetIrArray(*input, *reduce)
-              .EmitReadArrayElement(index, &ir_builder_);
+          return GetIrArray(*input, *reduce).EmitReadArrayElement(index, &b_);
         }},
         {[&](const IrArray::Index& index) {
           return GetIrArray(*init_value, *reduce)
-              .EmitReadArrayElement(index, &ir_builder_);
+              .EmitReadArrayElement(index, &b_);
         }},
         dimensions_to_reduce, {reducer}, {{}}, {});
   }
@@ -1780,9 +1770,12 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
 Status IrEmitterUnnested::HandleTuple(HloInstruction* tuple) {
   bool all_tuple_elements_have_buffer =
       c_all_of(tuple->operands(), [&](HloInstruction* tuple_element) {
-        return ir_emitter_context_->buffer_assignment().HasTopLevelAllocation(
-            tuple_element);
+        return ir_emitter_context_->buffer_assignment()
+            .GetUniqueTopLevelSlice(tuple_element)
+            .ok();
       });
+  // TODO(b/111689850): This logic isn't quite correct.
+  //
   // Tuples (especially tuples that are the final result of a computation) can
   // be so huge that if we were to emit a kernel that took each tuple element as
   // a parameter, we would exceed the max allowable number of parameters to a
@@ -1790,9 +1783,9 @@ Status IrEmitterUnnested::HandleTuple(HloInstruction* tuple) {
   // buffer, we collect their buffer addresses in a host array, and then copy
   // that array to the tuple's buffer.
   //
-  // Some tuple elements (e.g. const or bitcast of const) might not have a
-  // buffer -- their contents are stored in code. In that case, we fall back to
-  // emitting kernels which have access to their buffer addresses in code.
+  // Some tuple elements might not have an unambiguous buffer (like the result
+  // of a select-tuple). In that case, we fall back to emitting kernels which
+  // have access to their buffer addresses in code.
   if (all_tuple_elements_have_buffer) {
     std::vector<BufferAllocation::Slice> tuple_element_buffers;
     for (const HloInstruction* tuple_element : tuple->operands()) {
@@ -1842,7 +1835,7 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
   LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
       source->shape(), ir_emitter_context_->device_description());
   llvm::Type* index_type = GetIndexTypeForKernel(
-      select_and_scatter, launch_dimensions.launch_bound(), &ir_builder_);
+      select_and_scatter, launch_dimensions.launch_bound(), &b_);
   auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
     return llvm::ConstantInt::get(index_type, c);
   };
@@ -1873,19 +1866,18 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
     llvm::Value* selected_value_address = llvm_ir::EmitAllocaAtFunctionEntry(
         llvm_ir::PrimitiveTypeToIrType(operand_element_type,
                                        ir_emitter_context_->llvm_module()),
-        "selected_value_address", &ir_builder_);
+        "selected_value_address", &b_);
     llvm::Value* selected_index_address =
         llvm_ir::EmitAllocaAtFunctionEntryWithCount(
             index_type, index_typed_constant(rank), "selected_index_address",
-            &ir_builder_);
+            &b_);
     llvm::Value* initialized_flag_address = llvm_ir::EmitAllocaAtFunctionEntry(
-        ir_builder_.getInt1Ty(), "initialized_flag_address", &ir_builder_);
-    ir_builder_.CreateStore(ir_builder_.getInt1(false),
-                            initialized_flag_address);
+        b_.getInt1Ty(), "initialized_flag_address", &b_);
+    b_.CreateStore(b_.getInt1(false), initialized_flag_address);
 
     // Create the inner loop to iterate over the window.
-    llvm_ir::ForLoopNest window_loops(IrName(select_and_scatter, "inner"),
-                                      &ir_builder_, index_type);
+    llvm_ir::ForLoopNest window_loops(IrName(select_and_scatter, "inner"), &b_,
+                                      index_type);
     std::vector<int64> window_size;
     for (const auto& dim : window.dimensions()) {
       window_size.push_back(dim.size());
@@ -1894,84 +1886,79 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
     const IrArray::Index window_index = window_loops.AddLoopsForShape(
         ShapeUtil::MakeShape(operand_element_type, window_size), "window");
     llvm_ir::SetToFirstInsertPoint(window_loops.GetInnerLoopBodyBasicBlock(),
-                                   &ir_builder_);
+                                   &b_);
 
     // Compute the operand index to visit and evaluate the condition whether the
     // operand index is within the bounds. The unsigned comparison includes
     // checking whether the operand index >= 0.
     IrArray::Index operand_index(index_type, source_index.size());
-    llvm::Value* in_bounds_condition = ir_builder_.getInt1(true);
+    llvm::Value* in_bounds_condition = b_.getInt1(true);
     for (int64 i = 0; i < rank; ++i) {
-      llvm::Value* strided_index = ir_builder_.CreateNSWMul(
+      llvm::Value* strided_index = b_.CreateNSWMul(
           source_index[i], index_typed_constant(window.dimensions(i).stride()));
-      operand_index[i] = ir_builder_.CreateNSWSub(
-          ir_builder_.CreateNSWAdd(strided_index, window_index[i]),
+      operand_index[i] = b_.CreateNSWSub(
+          b_.CreateNSWAdd(strided_index, window_index[i]),
           index_typed_constant(window.dimensions(i).padding_low()));
-      llvm::Value* index_condition = ir_builder_.CreateICmpULT(
+      llvm::Value* index_condition = b_.CreateICmpULT(
           operand_index[i],
           index_typed_constant(ShapeUtil::GetDimension(operand->shape(), i)));
-      in_bounds_condition =
-          ir_builder_.CreateAnd(in_bounds_condition, index_condition);
+      in_bounds_condition = b_.CreateAnd(in_bounds_condition, index_condition);
     }
     CHECK(in_bounds_condition != nullptr);
 
     // Only need to do something if the operand index is within the bounds.
     // First check if the initialized_flag is set.
     llvm_ir::LlvmIfData if_in_bounds =
-        llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &ir_builder_);
-    llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, &ir_builder_);
+        llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_);
+    llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, &b_);
     llvm_ir::LlvmIfData if_initialized = llvm_ir::EmitIfThenElse(
-        ir_builder_.CreateLoad(initialized_flag_address), "initialized",
-        &ir_builder_);
+        b_.CreateLoad(initialized_flag_address), "initialized", &b_);
 
     // If the initialized_flag is false, initialize the selected value and index
     // with the currently visiting operand.
-    llvm_ir::SetToFirstInsertPoint(if_initialized.false_block, &ir_builder_);
+    llvm_ir::SetToFirstInsertPoint(if_initialized.false_block, &b_);
     const auto save_operand_index = [&](const IrArray::Index& operand_index) {
       for (int64 i = 0; i < rank; ++i) {
         llvm::Value* selected_index_address_slot =
-            ir_builder_.CreateInBoundsGEP(selected_index_address,
-                                          {ir_builder_.getInt32(i)});
-        ir_builder_.CreateStore(operand_index[i], selected_index_address_slot);
+            b_.CreateInBoundsGEP(selected_index_address, {b_.getInt32(i)});
+        b_.CreateStore(operand_index[i], selected_index_address_slot);
       }
     };
     IrArray operand_array = GetIrArray(*operand, *select_and_scatter);
     llvm::Value* operand_data =
-        operand_array.EmitReadArrayElement(operand_index, &ir_builder_);
-    ir_builder_.CreateStore(operand_data, selected_value_address);
+        operand_array.EmitReadArrayElement(operand_index, &b_);
+    b_.CreateStore(operand_data, selected_value_address);
     save_operand_index(operand_index);
-    ir_builder_.CreateStore(ir_builder_.getInt1(true),
-                            initialized_flag_address);
+    b_.CreateStore(b_.getInt1(true), initialized_flag_address);
 
     // If the initialized_flag is true, call the `select` function to
     // potentially update the selected value and index with the currently
     // visiting operand.
-    llvm_ir::SetToFirstInsertPoint(if_initialized.true_block, &ir_builder_);
+    llvm_ir::SetToFirstInsertPoint(if_initialized.true_block, &b_);
     const Shape output_shape = ShapeUtil::MakeShape(PRED, {});
     llvm::Value* operand_address =
-        operand_array.EmitArrayElementAddress(operand_index, &ir_builder_);
+        operand_array.EmitArrayElementAddress(operand_index, &b_);
     llvm::Value* select_return_buffer = llvm_ir::EmitAllocaAtFunctionEntry(
         llvm_ir::PrimitiveTypeToIrType(PRED,
                                        ir_emitter_context_->llvm_module()),
-        "select_return_buffer", &ir_builder_);
+        "select_return_buffer", &b_);
     TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
         *select_and_scatter->select(),
         {selected_value_address, operand_address}, select_return_buffer));
-    llvm::Value* result = ir_builder_.CreateLoad(select_return_buffer);
+    llvm::Value* result = b_.CreateLoad(select_return_buffer);
 
     // If the 'select' function returns false, update the selected value and the
     // index to the currently visiting operand.
-    llvm::Value* cond = ir_builder_.CreateICmpNE(
+    llvm::Value* cond = b_.CreateICmpNE(
         result,
         llvm::ConstantInt::get(llvm_ir::PrimitiveTypeToIrType(
                                    PRED, ir_emitter_context_->llvm_module()),
                                0),
         "boolean_predicate");
     llvm_ir::LlvmIfData if_select_lhs =
-        llvm_ir::EmitIfThenElse(cond, "if-select-lhs", &ir_builder_);
-    llvm_ir::SetToFirstInsertPoint(if_select_lhs.false_block, &ir_builder_);
-    ir_builder_.CreateStore(ir_builder_.CreateLoad(operand_address),
-                            selected_value_address);
+        llvm_ir::EmitIfThenElse(cond, "if-select-lhs", &b_);
+    llvm_ir::SetToFirstInsertPoint(if_select_lhs.false_block, &b_);
+    b_.CreateStore(b_.CreateLoad(operand_address), selected_value_address);
     save_operand_index(operand_index);
 
     // After iterating over the window elements, scatter the source element to
@@ -1979,20 +1966,19 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
     // location is computed by calling the `scatter` function with the source
     // value and the current output value.
     llvm_ir::SetToFirstInsertPoint(window_loops.GetOuterLoopExitBasicBlock(),
-                                   &ir_builder_);
+                                   &b_);
     IrArray::Index selected_index(operand_index.GetType());
     for (int64 i = 0; i < rank; ++i) {
-      llvm::Value* selected_index_address_slot = ir_builder_.CreateInBoundsGEP(
-          selected_index_address, {ir_builder_.getInt32(i)});
-      selected_index.push_back(
-          ir_builder_.CreateLoad(selected_index_address_slot));
+      llvm::Value* selected_index_address_slot =
+          b_.CreateInBoundsGEP(selected_index_address, {b_.getInt32(i)});
+      selected_index.push_back(b_.CreateLoad(selected_index_address_slot));
     }
     llvm::Value* source_value_address =
         GetIrArray(*source, *select_and_scatter)
-            .EmitArrayElementAddress(source_index, &ir_builder_);
+            .EmitArrayElementAddress(source_index, &b_);
     llvm::Value* output_value_address =
         GetIrArray(*select_and_scatter, *select_and_scatter)
-            .EmitArrayElementAddress(selected_index, &ir_builder_);
+            .EmitArrayElementAddress(selected_index, &b_);
     return EmitAtomicOperationForNestedComputation(
         *select_and_scatter->scatter(), output_value_address,
         source_value_address);
@@ -2007,7 +1993,7 @@ Status IrEmitterUnnested::HandleSelectAndScatter(
       static_cast<SequentialThunk*>(LastThunk())->thunks().back().get(),
       ir_emitter_context_->llvm_module());
   return ParallelLoopEmitter(loop_body_emitter, source->shape(),
-                             launch_dimensions, &ir_builder_)
+                             launch_dimensions, &b_)
       .EmitLoop(IrName(select_and_scatter), index_type);
 }
 
@@ -2034,10 +2020,44 @@ Status IrEmitterUnnested::HandleWhile(HloInstruction* xla_while) {
   return Status::OK();
 }
 
-Status IrEmitterUnnested::HandleRng(HloInstruction* random) {
-  thunk_sequence_->push_back(
-      BuildKernelThunk(random, /*implements_whole_instruction=*/true));
-  return IrEmitter::HandleRng(random);
+Status IrEmitterUnnested::HandleRng(HloInstruction* rng) {
+  // Build the kernel to generate the random numbers.
+  //
+  // Unroll the kernel so that the duplicated computation that calculates the
+  // 128 bit sample can be optimized away by LLVM.
+  thunk_sequence_->emplace_back(
+      BuildKernelThunk(rng, /*implements_whole_instruction=*/false,
+                       ComputeMaxUnrollFactor(rng)));
+  ElementalIrEmitter::HloToElementGeneratorMap operand_to_generator;
+  for (const HloInstruction* operand : rng->operands()) {
+    operand_to_generator[operand] = [=](const llvm_ir::IrArray::Index& index) {
+      return GetIrArray(*operand, *rng).EmitReadArrayElement(index, &b_);
+    };
+  }
+  TF_RETURN_IF_ERROR(EmitTargetElementLoop(
+      *rng, GpuElementalIrEmitter(hlo_module_config_, module_, &b_,
+                                  GetNestedComputer())
+                .MakeElementGenerator(rng, operand_to_generator)));
+  std::unique_ptr<Thunk> rng_thunk = std::move(thunk_sequence_->back());
+  thunk_sequence_->pop_back();
+
+  // Emit a kernel to increment the global state for Philox RNG algorithm.
+  thunk_sequence_->emplace_back(
+      BuildKernelThunk(rng, /*implements_whole_instruction=*/false));
+  llvm_ir::IncrementVariableForPhiloxRngState(1, module_, &b_);
+  std::unique_ptr<Thunk> increment_seed_thunk =
+      std::move(thunk_sequence_->back());
+  thunk_sequence_->pop_back();
+
+  // Build the SequentialThunk for the RNG hlo.
+  std::vector<std::unique_ptr<Thunk>> thunks;
+  thunks.reserve(2);
+  thunks.push_back(std::move(rng_thunk));
+  thunks.push_back(std::move(increment_seed_thunk));
+  thunk_sequence_->emplace_back(
+      MakeUnique<SequentialThunk>(std::move(thunks), rng));
+
+  return Status::OK();
 }
 
 Status IrEmitterUnnested::HandleSelect(HloInstruction* select) {
@@ -2049,30 +2069,94 @@ Status IrEmitterUnnested::HandleSelect(HloInstruction* select) {
 Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
   std::vector<std::unique_ptr<Thunk>> thunks;
   auto values = sort->operand_count() > 1 ? sort->operand(1) : nullptr;
+  ShapeIndex keys_shape_index({});
+  ShapeIndex values_shape_index({});
   if (values != nullptr) {
-    // TODO(b/26783907): Also sort the values by their corresponding key.
-    return Unimplemented("Key/Value Sort is not implemented on GPU");
+    keys_shape_index = ShapeIndex({0});
+    values_shape_index = ShapeIndex({1});
   }
+  auto keys_destination = GetAllocationSlice(*sort, keys_shape_index);
 
-  // First copy the operand to the output, so that we can sort in-place.
+  // First copy the operand(s) to the output, so that we can sort in-place.
   // TODO(b/26783907): Share buffer of output and operand when it is possible.
   if (sort->operand(0)->IsConstant()) {
     thunks.push_back(MakeUnique<HostToDeviceCopyThunk>(
         /*source_address=*/sort->operand(0)->literal().untyped_data(),
-        /*destination_buffer=*/GetAllocationSlice(*sort),
-        /*mem_size=*/ShapeUtil::ByteSizeOf(sort->shape()), sort));
+        /*destination_buffer=*/keys_destination,
+        /*mem_size=*/ShapeUtil::ByteSizeOf(sort->operand(0)->shape()),
+        nullptr));
   } else {
     thunks.push_back(MakeUnique<DeviceToDeviceCopyThunk>(
         /*source_address=*/GetAllocationSlice(*sort->operand(0)),
-        /*destination_buffer=*/GetAllocationSlice(*sort),
-        /*mem_size=*/ShapeUtil::ByteSizeOf(sort->shape()), sort));
+        /*destination_buffer=*/keys_destination,
+        /*mem_size=*/ShapeUtil::ByteSizeOf(sort->operand(0)->shape()),
+        nullptr));
+  }
+  if (values != nullptr) {
+    if (values->IsConstant()) {
+      thunks.push_back(MakeUnique<HostToDeviceCopyThunk>(
+          /*source_address=*/sort->operand(1)->literal().untyped_data(),
+          /*destination_buffer=*/GetAllocationSlice(*sort, values_shape_index),
+          /*mem_size=*/ShapeUtil::ByteSizeOf(sort->operand(1)->shape()),
+          nullptr));
+    } else {
+      thunks.push_back(MakeUnique<DeviceToDeviceCopyThunk>(
+          /*source_address=*/GetAllocationSlice(*sort->operand(1)),
+          /*destination_buffer=*/GetAllocationSlice(*sort, values_shape_index),
+          /*mem_size=*/ShapeUtil::ByteSizeOf(sort->operand(1)->shape()),
+          nullptr));
+    }
+  }
+
+  int64 dimension_to_sort = sort->dimensions(0);
+  int64 dimension_to_sort_bound =
+      sort->operand(0)->shape().dimensions(dimension_to_sort);
+  int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound);
+  auto index_type = b_.getInt64Ty();
+
+  // Naive C++ code for the outer loops:
+  //
+  // for (int64 stage = 0; stage < Log2Ceiling(dimension_to_sort_bound);
+  //     ++stage) {
+  //   int64 first_xor_mask = (1LL << (stage + 1)) - 1;
+  //   SortInPlace(first_xor_mask);
+  //   for (int64 mask = stage - 1; mask >= 0; --mask) {
+  //     int64 later_xor_mask = 1LL << mask;
+  //     SortInPlace(later_xor_mask);
+  //   }
+  // }
+  //
+  // This follows the algorithm described on Wikipedia:
+  // https://en.wikipedia.org/wiki/Bitonic_sorter
+
+  for (int64 stage = 0; stage < num_stages; ++stage) {
+    for (int64 mask = stage; mask >= 0; --mask) {
+      thunks.push_back(
+          BuildKernelThunk(sort, /*implements_whole_instruction=*/false));
+      LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
+          sort->operand(0)->shape(), ir_emitter_context_->device_description());
+      UpdateLaunchDimensions(launch_dimensions, thunks.back().get(),
+                             ir_emitter_context_->llvm_module());
+
+      llvm::Value* xor_mask;
+      if (mask == stage) {
+        xor_mask = llvm::ConstantInt::get(index_type, (1LL << (stage + 1)) - 1);
+      } else {
+        xor_mask = llvm::ConstantInt::get(index_type, 1LL << mask);
+      }
+
+      TF_RETURN_IF_ERROR(llvm_ir::EmitSortInPlace(
+          dimension_to_sort, GetIrArray(*sort, *sort, keys_shape_index),
+          values != nullptr ? tensorflow::gtl::make_optional<IrArray>(
+                                  GetIrArray(*sort, *sort, values_shape_index))
+                            : tensorflow::gtl::nullopt,
+          IrName(sort), xor_mask, &b_, &launch_dimensions));
+    }
   }
 
-  thunks.push_back(
-      BuildKernelThunk(sort, /*implements_whole_instruction=*/false));
   thunk_sequence_->emplace_back(
       MakeUnique<SequentialThunk>(std::move(thunks), sort));
-  return IrEmitter::HandleSort(sort);
+  return Status::OK();
 }
 
 Status IrEmitterUnnested::HandleTupleSelect(HloInstruction* tuple_select) {
@@ -2228,11 +2312,6 @@ GetHloBufferSlices(const HloInstruction* hlo,
 
   // Adds entries for all subshapes of instr to `slices`.
   auto add_slices_for = [&](const HloInstruction* instr) {
-    // GPU constants don't have buffers; don't bother looking for one.
-    if (instr->IsConstant()) {
-      return;
-    }
-
     ShapeUtil::ForEachSubshape(
         instr->shape(), [&](const Shape& /*shape*/, const ShapeIndex& index) {
           if (slices.count({instr, index})) {
@@ -2294,21 +2373,25 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
 
   // We'll pass a pointer to each of the elements of `buffers` to our kernel, in
   // this order.
-  std::vector<const BufferAllocation*> buffers(buffers_needed.begin(),
-                                               buffers_needed.end());
-  std::sort(buffers.begin(), buffers.end(),
+  std::vector<const BufferAllocation*> non_constant_buffers;
+  c_copy_if(buffers_needed, std::back_inserter(non_constant_buffers),
+            [](const BufferAllocation* allocation) {
+              return !allocation->is_constant();
+            });
+
+  std::sort(non_constant_buffers.begin(), non_constant_buffers.end(),
             [](const BufferAllocation* a, const BufferAllocation* b) {
               return a->index() < b->index();
             });
 
-  llvm::Function* kernel = BuildKernelPrototype(*inst, buffers);
+  llvm::Function* kernel = BuildKernelPrototype(*inst, non_constant_buffers);
 
   // Build a map from a BufferAllocation to the corresponding argument in our
   // kernel.
   std::unordered_map<const BufferAllocation*, llvm::Value*> kernel_args;
   {
     auto arg_it = kernel->arg_begin();
-    auto buffers_it = buffers.begin();
+    auto buffers_it = non_constant_buffers.begin();
     for (; arg_it != kernel->arg_end(); ++arg_it, ++buffers_it) {
       kernel_args[*buffers_it] = arg_it;
     }
@@ -2326,18 +2409,24 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
             << " is found in slice " << slice.ToString() << " at GTE index "
             << gte_index.ToString();
 
-    llvm::Value* loc =
-        ir_builder_.CreateInBoundsGEP(kernel_args.at(slice.allocation()),
-                                      {ir_builder_.getInt64(slice.offset())});
+    llvm::Value* loc;
+    if (slice.allocation()->is_constant()) {
+      loc = ir_emitter_context_->llvm_module()->getGlobalVariable(
+          llvm_ir::AsStringRef(llvm_ir::ConstantBufferAllocationToGlobalName(
+              *slice.allocation())));
+      CHECK_NE(loc, nullptr);
+    } else {
+      loc = b_.CreateInBoundsGEP(kernel_args.at(slice.allocation()),
+                                 {b_.getInt64(slice.offset())});
+    }
 
     // If gte_index is nonempty, we have to dereference `loc` to get to the
     // value we're ultimately interested in.
     llvm::Type* int8_double_pointer =
-        llvm::PointerType::get(ir_builder_.getInt8PtrTy(), /*AddressSpace=*/0);
+        llvm::PointerType::get(b_.getInt8PtrTy(), /*AddressSpace=*/0);
     for (int64 idx : gte_index) {
-      loc = ir_builder_.CreateBitCast(loc, int8_double_pointer);
-      loc = ir_builder_.CreateLoad(
-          ir_builder_.CreateInBoundsGEP(loc, {ir_builder_.getInt64(idx)}));
+      loc = b_.CreateBitCast(loc, int8_double_pointer);
+      loc = b_.CreateLoad(b_.CreateInBoundsGEP(loc, {b_.getInt64(idx)}));
     }
 
     bindings_.BindHloToIrValue(*instr, loc, index);
@@ -2349,12 +2438,12 @@ std::unique_ptr<KernelThunk> IrEmitterUnnested::BuildKernelThunk(
     bindings_.SetTempBufferBase(kernel_args.at(*temp_buffer));
   } else {
     bindings_.SetTempBufferBase(
-        llvm::ConstantPointerNull::get(ir_builder_.getInt8PtrTy()));
+        llvm::ConstantPointerNull::get(b_.getInt8PtrTy()));
   }
 
-  return MakeUnique<KernelThunk>(buffers, llvm_ir::AsString(kernel->getName()),
-                                 implements_whole_instruction ? inst : nullptr,
-                                 unroll_factor);
+  return MakeUnique<KernelThunk>(
+      non_constant_buffers, llvm_ir::AsString(kernel->getName()),
+      implements_whole_instruction ? inst : nullptr, unroll_factor);
 }
 
 std::unique_ptr<Thunk> IrEmitterUnnested::BuildHostToDeviceCopyThunk(
@@ -2591,15 +2680,24 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
   // If the init_value was fused into this reduce we have to generate it first.
   if (fused && init_value_operand->opcode() != HloOpcode::kParameter) {
     CHECK_EQ(HloOpcode::kConstant, init_value_operand->opcode());
-    TF_RETURN_IF_ERROR(HandleConstant(const_cast<HloInstruction*>(init_value)));
+
+    const Literal& literal = init_value_operand->literal();
+    llvm::Constant* initializer =
+        llvm_ir::ConvertLiteralToIrConstant(literal, module_);
+
+    llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable(
+        *module_, initializer->getType(),
+        /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, initializer,
+        /*Name=*/"");
+    global_for_const->setAlignment(kConstantBufferAlignBytes);
+    bindings_.BindHloToIrValue(*init_value_operand, global_for_const);
   }
   TF_RETURN_IF_ERROR(ParallelLoopEmitter(
                          [=](const IrArray::Index& index) {
                            return GetIrArray(*init_value, *hlo)
-                               .EmitReadArrayElement(index, &ir_builder_);
+                               .EmitReadArrayElement(index, &b_);
                          },
-                         GetIrArray(*hlo, *hlo, index), launch_dimensions,
-                         &ir_builder_)
+                         GetIrArray(*hlo, *hlo, index), launch_dimensions, &b_)
                          .EmitLoop(IrName(hlo)));
 
   // Clean up state left behind by emitting the loop above.  (This is normally
@@ -2710,13 +2808,13 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildWhileThunk(
   HloComputation* condition = hlo->while_condition();
   IrEmitterUnnested ir_emitter_condition(hlo_module_config_, condition,
                                          ir_emitter_context_);
-  TF_CHECK_OK(condition->root_instruction()->Accept(&ir_emitter_condition));
+  TF_CHECK_OK(condition->Accept(&ir_emitter_condition));
 
   // Generate thunk sequence for while 'body'.
   HloComputation* body = hlo->while_body();
   IrEmitterUnnested ir_emitter_body(hlo_module_config_, body,
                                     ir_emitter_context_);
-  TF_CHECK_OK(body->root_instruction()->Accept(&ir_emitter_body));
+  TF_CHECK_OK(body->Accept(&ir_emitter_body));
 
   return MakeUnique<WhileThunk>(
       GetAllocationSlice(*condition->root_instruction()),  // cond result
@@ -2734,7 +2832,7 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildForThunk(
   HloComputation* body = hlo->while_body();
   IrEmitterUnnested ir_emitter_body(hlo_module_config_, body,
                                     ir_emitter_context_);
-  TF_CHECK_OK(body->root_instruction()->Accept(&ir_emitter_body));
+  TF_CHECK_OK(body->Accept(&ir_emitter_body));
 
   return MakeUnique<ForThunk>(loop_limit,
                               ir_emitter_body.ConsumeThunkSequence(), hlo);
@@ -2750,12 +2848,12 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildConditionalThunk(
   HloComputation* true_computation = hlo->true_computation();
   IrEmitterUnnested ir_emitter_true(hlo_module_config_, true_computation,
                                     ir_emitter_context_);
-  TF_CHECK_OK(true_computation->root_instruction()->Accept(&ir_emitter_true));
+  TF_CHECK_OK(true_computation->Accept(&ir_emitter_true));
 
   HloComputation* false_computation = hlo->false_computation();
   IrEmitterUnnested ir_emitter_false(hlo_module_config_, false_computation,
                                      ir_emitter_context_);
-  TF_CHECK_OK(false_computation->root_instruction()->Accept(&ir_emitter_false));
+  TF_CHECK_OK(false_computation->Accept(&ir_emitter_false));
 
   return MakeUnique<ConditionalThunk>(
       GetAllocationSlice(*hlo->operand(0)),
@@ -2783,10 +2881,10 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
                          ir_emitter_context_->llvm_module());
   if (!hlo.IsMultiOutputFusion()) {
     return ParallelLoopEmitter(element_generator, GetIrArray(hlo, hlo),
-                               launch_dimensions, &ir_builder_, unroll_factor)
-        .EmitLoop(IrName(&hlo),
-                  GetIndexTypeForKernel(&hlo, launch_dimensions.launch_bound(),
-                                        &ir_builder_));
+                               launch_dimensions, &b_, unroll_factor)
+        .EmitLoop(
+            IrName(&hlo),
+            GetIndexTypeForKernel(&hlo, launch_dimensions.launch_bound(), &b_));
   }
 
   // For multioutput fusion, we need to emit each operand and the root.
@@ -2796,25 +2894,24 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
   }
   TF_RETURN_IF_ERROR(
       ParallelLoopEmitter(element_generator, output_arrays, launch_dimensions,
-                          &ir_builder_, unroll_factor)
+                          &b_, unroll_factor)
           .EmitLoop(IrName(&hlo),
                     GetIndexTypeForKernel(
-                        &hlo, launch_dimensions.launch_bound(), &ir_builder_)));
+                        &hlo, launch_dimensions.launch_bound(), &b_)));
 
   std::vector<llvm::Value*> tuple_operand_ptrs;
   for (int64 i = 0; i < output_arrays.size(); ++i) {
     tuple_operand_ptrs.push_back(output_arrays[i].GetBasePointer());
   }
-  ir_builder_.SetInsertPoint(ir_builder_.GetInsertBlock()->getTerminator());
-  llvm_ir::EmitTuple(GetIrArray(hlo, hlo), tuple_operand_ptrs, &ir_builder_,
-                     module_);
+  b_.SetInsertPoint(b_.GetInsertBlock()->getTerminator());
+  llvm_ir::EmitTuple(GetIrArray(hlo, hlo), tuple_operand_ptrs, &b_, module_);
   return Status::OK();
 }
 
 Status IrEmitterUnnested::EmitTargetElementLoop(
     const HloInstruction& hlo,
     const llvm_ir::ElementGenerator& element_generator) {
-  CHECK(Thunk::Kind::kKernel == LastThunk()->kind());
+  CHECK_EQ(Thunk::Kind::kKernel, LastThunk()->kind());
   return EmitTargetElementLoopInThunk(hlo, element_generator,
                                       static_cast<KernelThunk*>(LastThunk()));
 }
@@ -2858,14 +2955,14 @@ int IrEmitterUnnested::ConstructOutputReducedShapeAndCastOutputIrArrayToShape(
       output_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout(
           ShapeUtil::GetSubshape(hlo.shape(), {i}).element_type(),
           reduced_output_dims));
-      output_in_reduced_shape_arrays->push_back(output_arrays[i].CastToShape(
-          (*output_reduced_shapes)[i], &ir_builder_));
+      output_in_reduced_shape_arrays->push_back(
+          output_arrays[i].CastToShape((*output_reduced_shapes)[i], &b_));
     }
   } else {
     output_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout(
         hlo.shape().element_type(), reduced_output_dims));
-    output_in_reduced_shape_arrays->push_back(output_arrays[0].CastToShape(
-        (*output_reduced_shapes)[0], &ir_builder_));
+    output_in_reduced_shape_arrays->push_back(
+        output_arrays[0].CastToShape((*output_reduced_shapes)[0], &b_));
   }
   return num_outputs;
 }
@@ -2889,8 +2986,8 @@ int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape(
     param_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout(
         param->shape().element_type(),
         Permute({0, 2, 1}, reduced_output_dims)));
-    param_in_reduced_shape_arrays->push_back(param_arrays[id].CastToShape(
-        (*param_reduced_shapes)[id], &ir_builder_));
+    param_in_reduced_shape_arrays->push_back(
+        param_arrays[id].CastToShape((*param_reduced_shapes)[id], &b_));
   }
   return num_params;
 }
@@ -3039,7 +3136,7 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
         kTileSize);
     const int kNVPTXSharedMemoryAddrSpace = 3;
     auto* tile_base_ptr = new llvm::GlobalVariable(
-        *ir_builder_.GetInsertBlock()->getParent()->getParent(), tile_type,
+        *b_.GetInsertBlock()->getParent()->getParent(), tile_type,
         /*isConstant=*/false, llvm::GlobalValue::PrivateLinkage,
         llvm::UndefValue::get(tile_type),
         llvm_ir::AsStringRef(IrName(hlo, StrCat("tile", id))), nullptr,
@@ -3063,8 +3160,8 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
       c_accumulate(output_dims_in_tiles, 1, std::multiplies<int64>());
   LaunchDimensions launch_dimensions(num_tiles, kThreadsPerTile);
 
-  llvm::Type* index_ty = GetIndexTypeForKernel(
-      hlo, launch_dimensions.launch_bound(), &ir_builder_);
+  llvm::Type* index_ty =
+      GetIndexTypeForKernel(hlo, launch_dimensions.launch_bound(), &b_);
   auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
     return llvm::ConstantInt::get(index_ty, c);
   };
@@ -3092,23 +3189,23 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
   llvm::Value* x;
   llvm::Value* y;
   std::tie(y, x) = CalculateYXCoordinateWithinTile(
-      &ir_builder_, index_typed_constant(kTileSize), kThreadsPerTile);
+      &b_, index_typed_constant(kTileSize), kThreadsPerTile);
 
   // Calculate the index for the current output tile from block_id.
   const IrArray::Index output_tile_index(
-      GetBlockIdx(&ir_builder_, index_ty, num_tiles),
+      GetBlockIdx(&b_, index_ty, num_tiles),
       ShapeUtil::MakeShapeWithDescendingLayout(PRED /*arbitrary*/,
                                                output_dims_in_tiles),
-      &ir_builder_);
+      &b_);
 
   // Output tile origin is the index for the first element of the current output
   // tile.
   const IrArray::Index output_tile_origin = [&] {
     IrArray::Index index = output_tile_index;
     for (int i = 1; i < 3; ++i) {
-      index[i] = ir_builder_.CreateMul(output_tile_index[i],
-                                       index_typed_constant(kTileSize),
-                                       "tile_origin." + std::to_string(i));
+      index[i] =
+          b_.CreateMul(output_tile_index[i], index_typed_constant(kTileSize),
+                       "tile_origin." + std::to_string(i));
     }
     return index;
   }();
@@ -3121,16 +3218,15 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
   std::vector<llvm::Value*> output_tile_bounds(3);
   for (int i = 1; i < 3; ++i) {
     // Only last row or column may not have full size.
-    output_tile_bounds[i] = ir_builder_.CreateSelect(
-        ir_builder_.CreateICmpEQ(
-            output_tile_index[i],
-            index_typed_constant(output_dims_in_tiles[i] - 1)),
+    output_tile_bounds[i] = b_.CreateSelect(
+        b_.CreateICmpEQ(output_tile_index[i],
+                        index_typed_constant(output_dims_in_tiles[i] - 1)),
         index_typed_constant(reduced_output_dims[i] -
                              (output_dims_in_tiles[i] - 1) * kTileSize),
         index_typed_constant(kTileSize), "kTileSize");
   }
 
-  KernelSupportLibrary ksl(&ir_builder_, llvm_ir::UnrollMode::kDefaultUnroll);
+  KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll);
 
   // Curry a few parameters to EmitTiledElementalCodeWithBoundsCheck.
   auto emit_tiled_elemental_code_with_bounds_check =
@@ -3139,13 +3235,13 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
           const std::function<void(const IrArray::Index&, llvm::Value*)>&
               emit_elem_function) {
         EmitTiledElementalCodeWithBoundsCheck(
-            kTileSize, kNumRows, index, loop_name, &ksl, &ir_builder_, y, x,
-            tile_width, tile_height, emit_elem_function);
+            kTileSize, kNumRows, index, loop_name, &ksl, &b_, y, x, tile_width,
+            tile_height, emit_elem_function);
       };
 
   // Adds `addend` to the given `dim` of `index`.
   auto offset_dim = [&](IrArray::Index index, llvm::Value* addend, int64 dim) {
-    index[dim] = ir_builder_.CreateAdd(index[dim], addend);
+    index[dim] = b_.CreateAdd(index[dim], addend);
     return index;
   };
   const IrArray::Index input_index =
@@ -3161,19 +3257,17 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
           llvm::Value* shmem_buffer = param_shmem_buffers[id];
           // TODO(jlebar): Add AA metadata to this store.  Tile buffers are
           // global variables, so LLVM can't infer much about it.
-          ir_builder_.CreateStore(
-              input_in_logical_shape.EmitReadArrayElement(index, &ir_builder_,
+          b_.CreateStore(
+              input_in_logical_shape.EmitReadArrayElement(index, &b_,
                                                           "input_element"),
-              ir_builder_.CreateGEP(shmem_buffer,
-                                    {index_typed_constant(0), y_loc, x}));
+              b_.CreateGEP(shmem_buffer, {index_typed_constant(0), y_loc, x}));
         }
       });
 
   // Wait for all threads to reach this point, lest we copy a value from tile to
   // output before the other thread copies it from input to tile.
   // This is `__syncthreads` in CUDA.
-  llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {},
-                               &ir_builder_);
+  llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, &b_);
 
   llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x);
 
@@ -3187,27 +3281,26 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
         output_index, "output", output_tile_bounds[2], output_tile_bounds[1],
         [&](const IrArray::Index& index, llvm::Value* y_loc) {
           // TODO(jlebar): Add AA metadata to this load.
-          llvm::Instruction* load_from_shmem_buffer = ir_builder_.CreateLoad(
-              ir_builder_.CreateGEP(param_shmem_buffers[0],
-                                    {ir_builder_.getInt64(0), x, y_loc}),
+          llvm::Instruction* load_from_shmem_buffer = b_.CreateLoad(
+              b_.CreateGEP(param_shmem_buffers[0], {b_.getInt64(0), x, y_loc}),
               "output_element");
           output_in_reduced_shape_arrays[0].EmitWriteArrayElement(
-              index, load_from_shmem_buffer, &ir_builder_);
+              index, load_from_shmem_buffer, &b_);
         });
   } else {
     CHECK_EQ(hlo->opcode(), HloOpcode::kFusion);
     emit_tiled_elemental_code_with_bounds_check(
         output_index, "output", output_tile_bounds[2], output_tile_bounds[1],
         [&](const IrArray::Index& index, llvm::Value* y_loc) {
-          GpuElementalIrEmitter elem_emitter(hlo_module_config_, module_,
-                                             &ir_builder_, GetNestedComputer());
+          GpuElementalIrEmitter elem_emitter(hlo_module_config_, module_, &b_,
+                                             GetNestedComputer());
           FusedIrEmitter fused_emitter(param_arrays, &elem_emitter);
           tiled_param_info.set_y(y_loc);
           fused_emitter.SetTiledParameterInfo(&tiled_param_info);
           TF_CHECK_OK(hlo->fused_expression_root()->Accept(&fused_emitter));
           IrArray::Index untiled_index = llvm_ir::GetUnreducedOutputIndex(
               index, output_reduced_shapes[0], output_arrays[0].GetShape(),
-              &ir_builder_);
+              &b_);
           const llvm_ir::ElementGenerator& output_generator =
               fused_emitter.GetRootGenerator();
           llvm::Value* output_value =
@@ -3218,12 +3311,11 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
                      output_in_reduced_shape_arrays.size());
             for (int64 i = 0; i < output_in_reduced_shape_arrays.size(); ++i) {
               output_in_reduced_shape_arrays[i].EmitWriteArrayElement(
-                  index, ir_builder_.CreateExtractValue(output_value, i),
-                  &ir_builder_);
+                  index, b_.CreateExtractValue(output_value, i), &b_);
             }
           } else {
             output_in_reduced_shape_arrays[0].EmitWriteArrayElement(
-                index, output_value, &ir_builder_);
+                index, output_value, &b_);
           }
         });
   }
@@ -3234,7 +3326,7 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
     for (int64 i = 0; i < output_arrays.size(); ++i) {
       tuple_operand_ptrs.push_back(output_arrays[i].GetBasePointer());
     }
-    llvm_ir::EmitTuple(GetIrArray(*hlo, *hlo), tuple_operand_ptrs, &ir_builder_,
+    llvm_ir::EmitTuple(GetIrArray(*hlo, *hlo), tuple_operand_ptrs, &b_,
                        module_);
   }
 
@@ -3285,6 +3377,40 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
     return false;
   }
 
+  // Each of our shared memory tiles has 32*33 elements (so ~4kb, if the
+  // elements are of size 4 bytes), and CUDA has an architectural limit of 48kb
+  // shared memory per SM.  (This is increased to 96kb in Volta, but we don't
+  // use this, in part because it eats into our L1 cache space.)
+  //
+  // For correctness we need to ensure that we don't make more than 48kb worth
+  // of shmem tiles per block.  And for performance, we'd probably like to use
+  // significantly less, so that we can fit more than one block at a time on a
+  // gpu core.
+  //
+  // We say without benchmarks that we want at least 3 threads/block,
+  // corresponding to 3 shmem tiles if the elements are 32 bits wide.  We choose
+  // which params get the shmem transpose treatment arbitrarily; it's not clear
+  // if there's a Right Choice.
+  //
+  // This is only sound if tiled transposes are the only place where we use
+  // shared memory in fusions.  If in the future other fusile ops use shared
+  // memory, we'll have to adjust this heuristic.
+  constexpr int kMinBlocksPerCore = 3;
+  constexpr int64 kShmemPerCore = 48 * 1024;
+  int64 shmem_used = 0;
+  for (int64 i = 0; i < params_012.size(); ++i) {
+    const HloInstruction* operand = hlo->operand(params_012[i]);
+    shmem_used +=
+        32 * 33 *
+        ShapeUtil::ByteSizeOfPrimitiveType(operand->shape().element_type());
+
+    if (kMinBlocksPerCore * shmem_used > kShmemPerCore) {
+      // Erase this element and everything after it from params_012.
+      params_012.resize(i);
+      break;
+    }
+  }
+
   VLOG(3) << "EmitHlo021Tile Emitting hlo tile 0-2-1" << hlo->ToString();
   thunk_sequence_->emplace_back(
       BuildKernelThunk(hlo, /*implements_whole_instruction=*/true));
@@ -3296,5 +3422,47 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
   return true;
 }
 
+Status IrEmitterUnnested::EmitConstantGlobals() {
+  for (const BufferAllocation& allocation :
+       ir_emitter_context_->buffer_assignment().Allocations()) {
+    if (!allocation.is_constant()) {
+      continue;
+    }
+
+    const Literal& literal = llvm_ir::LiteralForConstantAllocation(allocation);
+    const bool should_emit_initializer = ShouldEmitLiteralInLlvmIr(literal);
+    llvm::ArrayType* global_type =
+        llvm::ArrayType::get(b_.getInt8Ty(), allocation.size());
+    llvm::Constant* initializer =
+        should_emit_initializer
+            ? llvm_ir::ConvertLiteralToIrConstant(literal, module_)
+            : llvm::ConstantAggregateZero::get(global_type);
+    if (should_emit_initializer) {
+      VLOG(3) << "Emitted initializer for constant with shape "
+              << ShapeUtil::HumanString(literal.shape());
+    }
+
+    // These globals will be looked up by name by GpuExecutable so we need to
+    // give them an external linkage.  Not all of their uses are visible in the
+    // LLVM IR (e.g. TupleThunk) so we can't give then a linkage that merely
+    // preserves their names (like available_externally), we also need to ensure
+    // that they stick around even if they're "unused".
+    //
+    // We may have to be more more clever here in the future if we notice that
+    // we're keeping around too many globals because of their linkage.
+    llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable(
+        global_type, /*isConstant=*/should_emit_initializer,
+        llvm::GlobalValue::ExternalLinkage,
+        /*Initializer=*/initializer,
+        llvm_ir::AsStringRef(
+            llvm_ir::ConstantBufferAllocationToGlobalName(allocation)));
+    global_for_const->setAlignment(kConstantBufferAlignBytes);
+    ir_emitter_context_->llvm_module()->getGlobalList().push_back(
+        global_for_const);
+  }
+
+  return Status::OK();
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index 616d8a2206e5a9666947008879c48f99a022e899..525441990795e160ba0e8facb910d5cc9796c4bb 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -92,6 +92,9 @@ class IrEmitterUnnested : public IrEmitter {
       const HloInstruction& hlo, const llvm_ir::ElementGenerator& body_emitter,
       KernelThunk* thunk);
 
+  // Emits LLVM global variables corresponding to constant instructions.
+  Status EmitConstantGlobals();
+
  private:
   // Builds the appropriate thunk for the instruction hlo and returns the owning
   // pointer to it. The caller needs to make sure `inst` outlives the lifetime
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
index ea661b3c2cb2c945297ac2098cd1c4009b2e966d..6fef7208533e83484ee7fdee22528fb4d219272c 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
@@ -23,6 +23,8 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -71,7 +73,6 @@ bool GpuMultiOutputFusion::ShapesCompatibleForFusion(HloInstruction* instr1,
   // In that case, the operand of the reduce needs to have the same shape
   // as the other tuple operands, but also we need to compare the output
   // shapes of the reduces.
-  // TODO(tjoerg): Allow differences in fp precision.
   auto* element_instr_1 = get_element_instr(instr1);
   auto* element_instr_2 = get_element_instr(instr2);
   if (element_instr_1->opcode() == HloOpcode::kReduce &&
@@ -80,8 +81,8 @@ bool GpuMultiOutputFusion::ShapesCompatibleForFusion(HloInstruction* instr1,
     return false;
   }
   // The elementwise output shapes must be the same (including layout).
-  return ShapeUtil::Equal(get_element_shape(element_instr_1),
-                          get_element_shape(element_instr_2));
+  return ShapeUtil::EqualIgnoringFpPrecision(
+      get_element_shape(element_instr_1), get_element_shape(element_instr_2));
 }
 
 namespace {
@@ -107,6 +108,27 @@ bool IsInputFusibleReduction(HloInstruction* instr) {
     return IsReductionToVector(*instr);
   }
 }
+
+// The code emitted for reduction suffers from poor data locality if the layouts
+// of input parameters differ. In such situtations it is beneficial not to fuse.
+// We consider input params with maximum rank only. Params with smaller ranks
+// will be broadcasted and have not been observed to cause data locality issues.
+// TODO(b/110927656): Improve reduce emitters to remove this limitation.
+bool ReduceFriendlyInputLayouts(HloInstruction* instr) {
+  int64 max_rank = 0;
+  const Layout* max_rank_layout;
+  for (HloInstruction* param : instr->fused_parameters()) {
+    if (ShapeUtil::Rank(param->shape()) > max_rank) {
+      max_rank = ShapeUtil::Rank(param->shape());
+      max_rank_layout = &param->shape().layout();
+    }
+  }
+  return c_all_of(instr->fused_parameters(), [&](HloInstruction* param) {
+    return (ShapeUtil::Rank(param->shape()) < max_rank) ||
+           (LayoutUtil::Equal(param->shape().layout(), *max_rank_layout));
+  });
+}
+
 }  // namespace
 
 bool GpuMultiOutputFusion::IsFusible(HloInstruction* instr) {
@@ -142,16 +164,22 @@ bool GpuMultiOutputFusion::LegalToFuse(HloInstruction* instr1,
   if (!MultiOutputFusion::LegalToFuse(instr1, instr2)) {
     return false;
   }
+
   // If we're fusing fusions only do it if the fusion kind matches. Loop fusions
   // merge into bigger loop fusions and input (reduce) fusions become fusions
   // with multiple reduce outputs. We could fuse reduce and loop fusions
   // together too (the result being an input fusion) if we find cases where this
   // improves things.
   CHECK(instr1->opcode() == HloOpcode::kFusion);
-  if (instr2->opcode() == HloOpcode::kFusion) {
-    return instr1->fusion_kind() == instr2->fusion_kind();
+  if ((instr2->opcode() == HloOpcode::kFusion &&
+       instr1->fusion_kind() != instr2->fusion_kind()) ||
+      (instr2->opcode() != HloOpcode::kFusion &&
+       instr1->fusion_kind() == HloInstruction::FusionKind::kLoop)) {
+    return false;
   }
-  return instr1->fusion_kind() != HloInstruction::FusionKind::kLoop;
+
+  // Do this check last, as it may be expensive.
+  return !GpuInstructionFusion::FusionWouldBeTooLarge(instr1, instr2);
 }
 
 bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() {
@@ -173,29 +201,41 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() {
   // fusions operands.
   for (HloInstruction* consumer : computation()->MakeInstructionPostOrder()) {
     if (consumer->user_count() == 0) {
+      VLOG(3) << consumer->name() << " has no users.";
       continue;
     }
     if (!IsInputFusibleReduction(consumer)) {
+      VLOG(3) << consumer->name() << " is not an input-fusable reduction.";
       continue;
     }
+    VLOG(3) << consumer->name()
+            << " is a fusion candidate. Looking for fuseable operands.";
 
     auto consumer_operands = consumer->operands();
     for (size_t i = 0; i < consumer_operands.size(); ++i) {
       HloInstruction* producer = consumer_operands[i];
       if (!producer->IsFusable()) {
+        VLOG(3) << producer->name() << " is not fusable.";
         continue;
       }
       const bool is_loop_fusion =
           producer->opcode() == HloOpcode::kFusion &&
           producer->fusion_kind() == HloInstruction::FusionKind::kLoop;
       if (!is_loop_fusion) {
+        VLOG(3) << producer->name() << " is not a loop fusion.";
         continue;
       }
       if (!ShapesCompatibleForFusion(producer, consumer)) {
+        VLOG(3) << producer->name() << " has an incompatible shape.";
+        continue;
+      }
+      if (!ReduceFriendlyInputLayouts(producer)) {
+        VLOG(3) << producer->name() << " has inputs with mixed layouts.";
         continue;
       }
       // If we have already decided to fuse this producer, skip it.
       if (ContainsKey(to_fuse, producer)) {
+        VLOG(3) << producer->name() << " will be fused with another consumer.";
         continue;
       }
       // Do not fuse a producer if the other operands of the fusion are
@@ -204,6 +244,7 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() {
             return producer != operand &&
                    reachability()->IsReachable(producer, operand);
           })) {
+        VLOG(3) << producer->name() << " would introduce a cycle when fused.";
         break;
       }
       to_fuse.insert(producer);
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
index a6dc635b5296cf42b3f4fe5c405d9cd4660ff850..ec4234b8d9a5da299a9dc574169b0bb5fe6a575f 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h"
 
+#include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -40,7 +41,7 @@ const char kModulePrefix[] = R"(
     scalar_mul_computation {
       scalar_lhs.1 = f32[] parameter(0)
       scalar_rhs.1 = f32[] parameter(1)
-      ROOT mul.1 = f32[] add(scalar_lhs.1, scalar_rhs.1)
+      ROOT mul.1 = f32[] multiply(scalar_lhs.1, scalar_rhs.1)
     })";
 
 TEST_F(MultiOutputFusionTest, MultiOutputFusionSiblingReduceAndReduceFusion) {
@@ -349,5 +350,128 @@ TEST_F(MultiOutputFusionTest, ProducerConsumerFusionDoNotFuseLoopReduceFusion) {
   ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
 }
 
+TEST_F(MultiOutputFusionTest,
+       ProducerConsumerFusionFp16LoopFusionAndReduceFusion) {
+  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+    fused_select {
+      p1.1 = f16[2,2,2]{2,1,0} parameter(1)
+      c0 = f16[] constant(0)
+      broadcast = f16[2,2,2]{2,1,0} broadcast(f16[] c0), dimensions={}
+      greater-than = pred[2,2,2]{2,1,0} greater-than(f32[2,2,2]{2,1,0} p1.1, f32[2,2,2]{2,1,0} broadcast)
+      p0.1 = f16[2,2,2]{2,1,0} parameter(0)
+      ROOT select = f16[2,2,2]{2,1,0} select(pred[2,2,2]{2,1,0} greater-than, f16[2,2,2]{2,1,0} p0.1, f16[2,2,2]{2,1,0} broadcast)
+    }
+    fused_reduce {
+      p0.2 = f16[2,2,2]{2,1,0} parameter(0)
+      convert = f32[2,2,2]{2,1,0} convert(p0.2)
+      c1 = f32[] constant(0)
+      r1 = f32[2,2]{1,0} reduce(convert, c1), dimensions={2}, to_apply=scalar_add_computation
+      mul = f32[2,2,2]{2,1,0} multiply(convert, convert)
+      r2 = f32[2,2]{1,0} reduce(mul, c1), dimensions={2}, to_apply=scalar_add_computation
+      ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(r1, r2)
+    }
+    ENTRY reduce {
+      p0 = f16[2,2,2]{2,1,0} parameter(0)
+      p1 = f16[2,2,2]{2,1,0} parameter(1)
+      select = f16[2,2,2]{2,1,0} fusion(p0, p1), kind=kLoop, calls=fused_select
+      fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(select), kind=kInput, calls=fused_reduce
+      gte0 = f32[2,2]{1,0} get-tuple-element(fusion), index=0
+      gte1 = f32[2,2]{1,0} get-tuple-element(fusion), index=1
+      ROOT root = (f32[2,2]{1,0}, f32[2,2]{1,0}, f16[2,2,2]{2,1,0}) tuple(gte1, gte1, select)
+    })"))
+                    .ValueOrDie();
+  ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+  SCOPED_TRACE(module->ToString());
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Tuple(op::GetTupleElement(), op::GetTupleElement(),
+                              op::GetTupleElement()));
+  const HloInstruction* fusion = root->operand(0)->operand(0);
+  ASSERT_TRUE(fusion->IsMultiOutputFusion());
+  EXPECT_THAT(fusion->fused_expression_root(),
+              op::Tuple(op::Reduce(), op::Reduce(), op::Select()));
+}
+
+TEST_F(MultiOutputFusionTest,
+       ProducerConsumerFusionReduceUnfriendlyLoopFusion) {
+  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+    mixed_input_layouts_computation {
+      p0.1 = f16[128,1024,32,32]{1,3,2,0} parameter(0)
+      p1.1 = f16[128,1024,32,32]{3,2,1,0} parameter(1)
+      copy = f16[128,1024,32,32]{1,3,2,0} copy(p1.1)
+      c0 = f16[] constant(0)
+      broadcast = f16[128,1024,32,32]{1,3,2,0} broadcast(c0), dimensions={}
+      greater-than = pred[128,1024,32,32]{1,3,2,0} greater-than(copy, broadcast)
+      ROOT root = f16[128,1024,32,32]{1,3,2,0} select(greater-than, p0.1, broadcast)
+    }
+    fused_reduce {
+      p0.2 = f16[128,1024,32,32]{1,3,2,0} parameter(0)
+      convert = f32[128,1024,32,32]{1,3,2,0} convert(p0.2)
+      c0.2 = f32[] constant(0)
+      ROOT reduce = f32[1024]{0} reduce(convert, c0.2), dimensions={0,2,3}, to_apply=scalar_add_computation
+    }
+    ENTRY reduce {
+      p0 = f16[128,1024,32,32]{3,2,1,0} parameter(0)
+      p1 = f16[128,1024,32,32]{1,3,2,0} parameter(1)
+      loop_fusion = f16[128,1024,32,32]{1,3,2,0} fusion(p0, p1), kind=kLoop, calls=mixed_input_layouts_computation
+      reduce_fusion = f32[1024]{0} fusion(loop_fusion), kind=kInput, calls=fused_reduce
+      ROOT root = (f32[1024]{0}, f16[128,1024,32,32]{1,3,2,0}) tuple(reduce_fusion, loop_fusion)
+    })"))
+                    .ValueOrDie();
+  ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+}
+
+// Check that we limit the number of operands to fusions we create.
+TEST_F(MultiOutputFusionTest, AvoidsLargeFusion) {
+  constexpr int64 kNumParams = 200;
+  ASSERT_GT(kNumParams, GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion);
+
+  // Compute
+  //   p0 * p1,
+  //   p0 * p1 + p1 * p2
+  //   p0 * p1 + p1 * p2 + p2 * p3
+  //   ...
+  // where each of the (pi * pj)'s is represented as a fusion node so that
+  // multi-output fusion will pay attention to it.
+  auto module = CreateNewModule();
+  HloComputation::Builder b(TestName());
+  Shape shape = ShapeUtil::MakeShape(F32, {10, 100});
+
+  std::vector<HloInstruction*> params;
+  for (int64 i = 0; i < kNumParams; ++i) {
+    params.push_back(
+        b.AddInstruction(HloInstruction::CreateParameter(i, shape, "p")));
+  }
+
+  // Creates a fusion node that calculates x*y.
+  auto make_fusion = [&](HloInstruction* x, HloInstruction* y) {
+    HloComputation::Builder sub_builder("subcomp");
+    auto* p0 = sub_builder.AddInstruction(
+        HloInstruction::CreateParameter(0, shape, "p"));
+    auto* p1 = sub_builder.AddInstruction(
+        HloInstruction::CreateParameter(1, shape, "p"));
+    sub_builder.AddInstruction(
+        HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, p0, p1));
+    HloComputation* subcomp =
+        module->AddEmbeddedComputation(sub_builder.Build());
+    return HloInstruction::CreateFusion(
+        shape, HloInstruction::FusionKind::kLoop, {x, y}, subcomp);
+  };
+
+  auto* sum = b.AddInstruction(make_fusion(params[0], params[1]));
+  for (int64 i = 2; i < kNumParams; ++i) {
+    sum = b.AddInstruction(HloInstruction::CreateBinary(
+        shape, HloOpcode::kAdd, sum,
+        b.AddInstruction(make_fusion(params[i - 1], params[i]))));
+  }
+  auto computation = module->AddEntryComputation(b.Build());
+  EXPECT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+  SCOPED_TRACE(module->ToString());
+  for (const HloInstruction* instr : computation->instructions()) {
+    EXPECT_LE(instr->operand_count() + ShapeUtil::SubshapeCount(instr->shape()),
+              GpuInstructionFusion::kMaxOperandsAndOutputsPerFusion)
+        << instr->ToString();
+  }
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index ad29862d83edaac2d34308fa63d6fc2fd7417907..6d8996dac18423a18cc3b70da1a5aa081011dc22 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -540,11 +540,13 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
   // temporary buffers are required to run the computation.
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<BufferAssignment> buffer_assignment,
-      BufferAssigner::Run(module.get(), hlo_schedule->ConsumeHloOrdering(),
-                          BufferSizeBytesFunction(),
-                          /*color_alignment=*/[](LogicalBuffer::Color) {
-                            return kCudaMallocAlignBytes;
-                          }));
+      BufferAssigner::Run(
+          module.get(), hlo_schedule->ConsumeHloOrdering(),
+          BufferSizeBytesFunction(),
+          /*color_alignment=*/
+          [](LogicalBuffer::Color) { return kXlaAllocatedBufferAlignBytes; },
+          /*allow_input_output_aliasing=*/false,
+          /*allocate_buffers_for_constants=*/true));
   // BufferAssignment::Stats::ToString() and BufferAssignment::ToString()
   // include headers, so no need for us to print them ourselves.
   XLA_VLOG_LINES(1, buffer_assignment->GetStats().ToString());
@@ -565,6 +567,9 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
   HloComputation* entry_computation = module->entry_computation();
   IrEmitterUnnested ir_emitter(module->config(), entry_computation,
                                &ir_emitter_context);
+
+  TF_RETURN_IF_ERROR(ir_emitter.EmitConstantGlobals());
+
   {
     XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend - IR emission");
     TF_RETURN_IF_ERROR(entry_computation->Accept(&ir_emitter));
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
index cd833ec7bd858aabee84ac306d198e80eb112506..3838fee674566196e10ddd98462c1a1aa7835e1a 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc
@@ -32,27 +32,27 @@ namespace gpu {
 
 ParallelLoopEmitter::ParallelLoopEmitter(
     BodyEmitter body_emitter, const Shape& shape,
-    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b,
     int unroll_factor)
-    : LoopEmitter(body_emitter, shape, ir_builder),
+    : LoopEmitter(body_emitter, shape, b),
       launch_dimensions_(launch_dimensions),
       unroll_factor_(unroll_factor) {}
 
 ParallelLoopEmitter::ParallelLoopEmitter(
     const llvm_ir::ElementGenerator& target_element_generator,
     tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
-    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b,
     int unroll_factor)
-    : LoopEmitter(target_element_generator, target_arrays, ir_builder),
+    : LoopEmitter(target_element_generator, target_arrays, b),
       launch_dimensions_(launch_dimensions),
       unroll_factor_(unroll_factor) {}
 
 ParallelLoopEmitter::ParallelLoopEmitter(
     const llvm_ir::ElementGenerator& target_element_generator,
     const llvm_ir::IrArray& target_array,
-    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+    const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b,
     int unroll_factor)
-    : LoopEmitter(target_element_generator, target_array, ir_builder),
+    : LoopEmitter(target_element_generator, target_array, b),
       launch_dimensions_(launch_dimensions),
       unroll_factor_(unroll_factor) {}
 
@@ -74,29 +74,27 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   CHECK_NE(index_type, nullptr);
   std::vector<llvm_ir::IrArray::Index> array_indices;
   llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic(
-      llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, ir_builder_);
+      llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, b_);
   llvm_ir::AddRangeMetadata(0, launch_dimensions_.block_count(),
                             static_cast<llvm::Instruction*>(block_id));
-  block_id = ir_builder_->CreateZExtOrTrunc(block_id, index_type, "block_id");
+  block_id = b_->CreateZExtOrTrunc(block_id, index_type, "block_id");
 
   // Per the PTX documentation:
   //   "It is guaranteed that [...] 0  <=  %tid.x <  %ntid.x"
   //
   // %ntid.x is currently specified as 1024.
   llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic(
-      llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, ir_builder_);
+      llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b_);
   llvm_ir::AddRangeMetadata(0, launch_dimensions_.threads_per_block(),
                             static_cast<llvm::Instruction*>(thread_id));
-  thread_id =
-      ir_builder_->CreateZExtOrTrunc(thread_id, index_type, "thread_id");
-
-  llvm::Value* linear_index_base = ir_builder_->CreateAdd(
-      ir_builder_->CreateMul(
-          block_id,
-          llvm::ConstantInt::get(index_type,
-                                 launch_dimensions_.threads_per_block()),
-          "",
-          /*HasNUW=*/true, /*HasNSW=*/true),
+  thread_id = b_->CreateZExtOrTrunc(thread_id, index_type, "thread_id");
+
+  llvm::Value* linear_index_base = b_->CreateAdd(
+      b_->CreateMul(block_id,
+                    llvm::ConstantInt::get(
+                        index_type, launch_dimensions_.threads_per_block()),
+                    "",
+                    /*HasNUW=*/true, /*HasNSW=*/true),
       thread_id, "linear_index", /*HasNUW=*/true, /*HasNSW=*/true);
 
   // Add an @llvm.assume(linear_index < threads_per_block * num_blocks).
@@ -109,41 +107,41 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(
   // conditions in the same basic block as their operands.
   llvm_ir::EmitCallToIntrinsic(
       llvm::Intrinsic::assume,
-      {ir_builder_->CreateICmpULT(
+      {b_->CreateICmpULT(
           linear_index_base,
           llvm::ConstantInt::get(index_type,
                                  launch_dimensions_.threads_per_block() *
                                      launch_dimensions_.block_count()),
           "linear_index_in_range")},
-      {}, ir_builder_);
+      {}, b_);
 
   if (unroll_factor_ > 1) {
-    linear_index_base = ir_builder_->CreateMul(
+    linear_index_base = b_->CreateMul(
         linear_index_base, llvm::ConstantInt::get(index_type, unroll_factor_),
         "linear_index_base", /*HasNUW=*/true, /*HasNSW=*/true);
   }
 
-  array_indices.emplace_back(linear_index_base, shape_, ir_builder_);
+  array_indices.emplace_back(linear_index_base, shape_, b_);
   for (int i = 1; i < unroll_factor_; ++i) {
-    llvm::Value* linear_index = ir_builder_->CreateAdd(
-        linear_index_base, llvm::ConstantInt::get(index_type, i),
-        "linear_index",
-        /*HasNUW=*/true, /*HasNSW=*/true);
-    array_indices.emplace_back(linear_index, shape_, ir_builder_);
+    llvm::Value* linear_index =
+        b_->CreateAdd(linear_index_base, llvm::ConstantInt::get(index_type, i),
+                      "linear_index",
+                      /*HasNUW=*/true, /*HasNSW=*/true);
+    array_indices.emplace_back(linear_index, shape_, b_);
   }
 
   auto if_in_bounds = llvm_ir::EmitIfThenElse(
-      ir_builder_->CreateICmpULT(
+      b_->CreateICmpULT(
           linear_index_base,
           llvm::ConstantInt::get(index_type, ShapeUtil::ElementsIn(shape_))),
-      llvm_ir::IrName(loop_name, "in_bounds"), ir_builder_, false);
+      llvm_ir::IrName(loop_name, "in_bounds"), b_, false);
 
   // Set exit_bb_ to the exit block of the if structure.
   exit_bb_ = if_in_bounds.after_block;
   CHECK_NE(nullptr, exit_bb_);
 
   // Set IR builder insertion point to the body of the if structure.
-  llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, ir_builder_);
+  llvm_ir::SetToFirstInsertPoint(if_in_bounds.true_block, b_);
 
   return array_indices;
 }
diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
index 302e1bf1bc8e90f2eebd838f156a1552e86185ac..b82a23419df08cafdc69b6d2f14528484b95dc73 100644
--- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h
@@ -34,13 +34,13 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   // The meanings of other parameters are the same as LoopEmitter.
   ParallelLoopEmitter(BodyEmitter body_emitter, const Shape& shape,
                       const LaunchDimensions& launch_dimensions,
-                      llvm::IRBuilder<>* ir_builder, int unroll_factor = 1);
+                      llvm::IRBuilder<>* b, int unroll_factor = 1);
   // Constructs a ParallelLoopEmitter from an element generator that generates
   // each element of the given target array.
   ParallelLoopEmitter(const llvm_ir::ElementGenerator& target_element_generator,
                       const llvm_ir::IrArray& target_array,
                       const LaunchDimensions& launch_dimensions,
-                      llvm::IRBuilder<>* ir_builder, int unroll_factor = 1);
+                      llvm::IRBuilder<>* b, int unroll_factor = 1);
 
   // Constructs a loop emitter for a loop that generates on element of each of N
   // arrays on each iteration.
@@ -50,7 +50,7 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter {
   ParallelLoopEmitter(
       const llvm_ir::ElementGenerator& target_element_generator,
       tensorflow::gtl::ArraySlice<llvm_ir::IrArray> target_arrays,
-      const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* ir_builder,
+      const LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b,
       int unroll_factor = 1);
 
   ParallelLoopEmitter(const ParallelLoopEmitter&) = delete;
diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
index e4cfc6999f2da04dd7e7a34d854fdb3d75b8bfc6..0806dd51614f4d2da12f3fbbc9fb98df5273d5c8 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment.cc
@@ -33,13 +33,13 @@ int StreamAssignment::StreamNumberForHlo(const HloInstruction& hlo) const {
 }
 
 void StreamAssignment::AssignStreamToHlo(const HloInstruction* hlo,
-                                         int stream_no) {
-  CHECK_GE(stream_no, 0);
-  if (stream_no >= stream_count_) {
-    stream_count_ = stream_no + 1;
+                                         int stream_num) {
+  CHECK_GE(stream_num, 0);
+  if (stream_num >= stream_count_) {
+    stream_count_ = stream_num + 1;
   }
-  InsertOrDie(&hlo_to_stream_number_, hlo, stream_no);
-  VLOG(2) << "Assign stream #" << stream_no << " to " << hlo->ToString();
+  InsertOrDie(&hlo_to_stream_number_, hlo, stream_num);
+  VLOG(2) << "Assign stream #" << stream_num << " to " << hlo->ToString();
 }
 
 namespace {
@@ -51,6 +51,12 @@ bool CanRunConcurrently(const HloInstruction& a, const HloInstruction& b,
   return !reachability.IsConnected(&a, &b);
 }
 
+constexpr int kInvalidStreamNum = -1;
+//  Returns true iff `stream_num` is an invalid stream number.
+inline bool IsStreamNumValid(int stream_num) {
+  return stream_num != kInvalidStreamNum;
+}
+
 // Returns which existing stream to assign to `hlo`, or -1 if a stream is not
 // needed. `stream_assignment` is the existing stream assignment for all
 // instructions topologically before `hlo`. `seen_gemms` contains all GEMMs that
@@ -62,7 +68,7 @@ int ComputeStreamToAssign(
   if (hlo.opcode() == HloOpcode::kParameter ||
       hlo.opcode() == HloOpcode::kConstant) {
     // kParameter and kConstant do not need a thunk.
-    return -1;
+    return kInvalidStreamNum;
   }
 
   if (hlo.GetModule()
@@ -75,17 +81,17 @@ int ComputeStreamToAssign(
   if (!ImplementedAsGemm(hlo)) {
     // If `hlo` is not implemented as a GEMM, keep it close to its operands to
     // avoid excessive synchronization.
-    int stream_no = -1;
+    int stream_num = -1;
     for (const auto* operand : hlo.operands()) {
       if (stream_assignment.HasStreamAssigned(*operand)) {
-        stream_no =
-            std::max(stream_no, stream_assignment.StreamNumberForHlo(*operand));
+        stream_num = std::max(stream_num,
+                              stream_assignment.StreamNumberForHlo(*operand));
       }
     }
-    if (stream_no == -1) {
-      stream_no = 0;
+    if (!IsStreamNumValid(stream_num)) {
+      stream_num = 0;
     }
-    return stream_no;
+    return stream_num;
   }
 
   // Assign different streams to concurrent GEMMs. The code below uses a
@@ -94,17 +100,17 @@ int ComputeStreamToAssign(
   // `hlo` a different stream.
   std::set<int> forbidden_stream_numbers;
   for (const auto* seen_gemm : seen_gemms) {
-    int stream_no = stream_assignment.StreamNumberForHlo(*seen_gemm);
-    if (!forbidden_stream_numbers.count(stream_no) &&
+    int stream_num = stream_assignment.StreamNumberForHlo(*seen_gemm);
+    if (!forbidden_stream_numbers.count(stream_num) &&
         CanRunConcurrently(*seen_gemm, hlo, reachability)) {
-      forbidden_stream_numbers.insert(stream_no);
+      forbidden_stream_numbers.insert(stream_num);
     }
   }
 
-  for (int stream_no = 0; stream_no < stream_assignment.StreamCount();
-       ++stream_no) {
-    if (!forbidden_stream_numbers.count(stream_no)) {
-      return stream_no;
+  for (int stream_num = 0; stream_num < stream_assignment.StreamCount();
+       ++stream_num) {
+    if (!forbidden_stream_numbers.count(stream_num)) {
+      return stream_num;
     }
   }
   return stream_assignment.StreamCount();
@@ -118,11 +124,27 @@ std::unique_ptr<StreamAssignment> AssignStreams(const HloModule& module) {
   std::unique_ptr<HloReachabilityMap> reachability =
       computation.ComputeReachability();
   std::vector<const HloInstruction*> seen_gemms;
+  // The execution of different RNG Hlo instructions in the same module updates
+  // a common global variable. To avoid a race condition, we simply assign all
+  // RNG kernels to the same stream to make them run sequentially.
+  //
+  // TODO(b/111791052): If we remove such a common variable, we will need to
+  // clean up the code here.
+  int stream_num_for_rng = kInvalidStreamNum;
   for (const auto* hlo : computation.MakeInstructionPostOrder()) {
-    int stream_no = ComputeStreamToAssign(*hlo, *stream_assignment,
-                                          *reachability, seen_gemms);
-    if (stream_no != -1) {
-      stream_assignment->AssignStreamToHlo(hlo, stream_no);
+    // If we ever enable fusion of RNG instructions, we will need to extend this
+    // code to look inside a fused instruction.
+    int stream_num = (hlo->opcode() == HloOpcode::kRng &&
+                      IsStreamNumValid(stream_num_for_rng))
+                         ? stream_num_for_rng
+                         : ComputeStreamToAssign(*hlo, *stream_assignment,
+                                                 *reachability, seen_gemms);
+    if (IsStreamNumValid(stream_num)) {
+      stream_assignment->AssignStreamToHlo(hlo, stream_num);
+      if (hlo->opcode() == HloOpcode::kRng &&
+          !IsStreamNumValid(stream_num_for_rng)) {
+        stream_num_for_rng = stream_num;
+      }
     }
     if (ImplementedAsGemm(*hlo)) {
       seen_gemms.push_back(hlo);
diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc
index a50ddf6ac63c7fa7ccace94bc7f40f438aedccf8..f313a9f1729a4b70ae4092287fc296b5eaa572d7 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc
@@ -20,10 +20,10 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
-using stream_executor::dnn::DataLayout;
-using stream_executor::dnn::DataLayoutString;
-using stream_executor::dnn::FilterLayout;
-using stream_executor::dnn::FilterLayoutString;
+using se::dnn::DataLayout;
+using se::dnn::DataLayoutString;
+using se::dnn::FilterLayout;
+using se::dnn::FilterLayoutString;
 
 StatusOr<std::tuple<Layout, Layout, Layout>>
 StreamExecutorConvLayoutsToXlaLayouts(const ConvolutionDimensionNumbers& dnums,
diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h
index 39a6a38d001f502b2abb8de6efe2ce623b478c71..5c5e7f2a56f40782362c28b7d2c5b4baeab20112 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h
+++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_STREAM_EXECUTOR_UTIL_H_
 
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 
@@ -29,14 +30,13 @@ namespace gpu {
 // layouts.
 StatusOr<std::tuple<Layout, Layout, Layout>>
 StreamExecutorConvLayoutsToXlaLayouts(const ConvolutionDimensionNumbers& dnums,
-                                      stream_executor::dnn::DataLayout input,
-                                      stream_executor::dnn::FilterLayout filter,
-                                      stream_executor::dnn::DataLayout output);
+                                      se::dnn::DataLayout input,
+                                      se::dnn::FilterLayout filter,
+                                      se::dnn::DataLayout output);
 
 // Returns (input, filter, output) StreamExecutor layouts given the XLA layouts.
-StatusOr<std::tuple<stream_executor::dnn::DataLayout,
-                    stream_executor::dnn::FilterLayout,
-                    stream_executor::dnn::DataLayout>>
+StatusOr<
+    std::tuple<se::dnn::DataLayout, se::dnn::FilterLayout, se::dnn::DataLayout>>
 XlaConvLayoutsToStreamExecutorLayouts(const ConvolutionDimensionNumbers& dnums,
                                       const Layout& input, const Layout& filter,
                                       const Layout& output);
diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..4fad3f46cf953945e4f395e751e5ba76db97ecc4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD
@@ -0,0 +1,223 @@
+# Description: GPU-specific XLA tests. For example, codegen tests that
+# verify the IR emitted.
+#
+# TODO(jlebar): None of these tests actually use the GPU, so they should not
+# need to run on machines with GPUs present.
+
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = [":friends"])
+
+package_group(
+    name = "friends",
+    includes = [
+        "//tensorflow/compiler/xla:friends",
+    ],
+)
+
+# Filegroup used to collect source files for dependency checking.
+filegroup(
+    name = "c_srcs",
+    data = glob([
+        "**/*.cc",
+        "**/*.h",
+    ]),
+)
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+cc_library(
+    name = "gpu_codegen_test",
+    testonly = True,
+    srcs = ["gpu_codegen_test.cc"],
+    hdrs = ["gpu_codegen_test.h"],
+    tags = [
+        "requires-gpu-sm35",
+    ],
+    deps = [
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla/service:gpu_plugin",
+        "//tensorflow/compiler/xla/service/gpu:gpu_executable",
+        "//tensorflow/compiler/xla/tests:filecheck",
+        "//tensorflow/compiler/xla/tests:llvm_irgen_test_base",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "gpu_copy_test",
+    srcs = ["gpu_copy_test.cc"],
+    tags = [
+        "requires-gpu-sm35",
+    ],
+    deps = [
+        ":gpu_codegen_test",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cc_test(
+    name = "gpu_ftz_test",
+    srcs = ["gpu_ftz_test.cc"],
+    tags = [
+        "requires-gpu-sm35",
+    ],
+    deps = [
+        ":gpu_codegen_test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cc_test(
+    name = "gpu_index_test",
+    srcs = ["gpu_index_test.cc"],
+    tags = [
+        "requires-gpu-sm35",
+    ],
+    deps = [
+        ":gpu_codegen_test",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla:xla_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_module_config",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cc_test(
+    name = "gpu_infeed_test",
+    srcs = ["infeed_test.cc"],
+    tags = [
+        "requires-gpu-sm35",
+    ],
+    deps = [
+        ":gpu_codegen_test",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:arithmetic",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cc_test(
+    name = "gpu_kernel_tiling_test",
+    srcs = ["gpu_kernel_tiling_test.cc"],
+    tags = [
+        "requires-gpu-sm35",
+    ],
+    deps = [
+        ":gpu_codegen_test",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_module_config",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cc_test(
+    name = "gpu_ldg_test",
+    srcs = ["gpu_ldg_test.cc"],
+    tags = ["requires-gpu-sm35"],
+    deps = [
+        ":gpu_codegen_test",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cc_test(
+    name = "gpu_noalias_test",
+    srcs = ["gpu_noalias_test.cc"],
+    tags = [
+        "requires-gpu-sm35",
+    ],
+    deps = [
+        ":gpu_codegen_test",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cc_test(
+    name = "gpu_fusion_test",
+    srcs = ["gpu_fusion_test.cc"],
+    tags = [
+        "requires-gpu-sm35",
+    ],
+    deps = [
+        ":gpu_codegen_test",
+        "//tensorflow/compiler/xla/service:hlo_module_config",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cc_test(
+    name = "gpu_unrolling_test",
+    srcs = ["gpu_unrolling_test.cc"],
+    tags = [
+        "requires-gpu-sm35",
+    ],
+    deps = [
+        ":gpu_codegen_test",
+        "//tensorflow/compiler/xla/service:hlo_module_config",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cc_test(
+    name = "gpu_alignment_test",
+    testonly = True,
+    srcs = ["gpu_alignment_test.cc"],
+    tags = [
+        "requires-gpu-sm35",
+    ],
+    deps = [
+        ":gpu_codegen_test",
+        "//tensorflow/compiler/xla/service:gpu_plugin",
+        "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
+        "//tensorflow/compiler/xla/service/llvm_ir:alias_analysis",
+        "//tensorflow/compiler/xla/tests:filecheck",
+        "//tensorflow/compiler/xla/tests:llvm_irgen_test_base",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..672c68e59b59dff19f0c5575db26dea455c45053
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_alignment_test.cc
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h"
+#include "tensorflow/compiler/xla/tests/filecheck.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuAlignmentTest : public GpuCodegenTest {};
+
+TEST_F(GpuAlignmentTest, Test) {
+  const char* hlo_string = R"(
+HloModule GpuAlignmentTest
+
+ENTRY main {
+  zero = f32[] constant(0)
+  tok = token[] after-all()
+  a = f32[100] parameter(0)
+  b_tup = (f32[200], token[]) infeed(tok)
+  b = f32[200] get-tuple-element(b_tup), index=0
+  a_padded = f32[150] pad(a, zero), padding=0_50
+  b_sliced = f32[150] slice(b), slice={[0:150]}
+  ROOT c = f32[150] add(a_padded, b_sliced)
+}
+)";
+
+  CompileAndVerifyIr(hlo_string, R"(
+CHECK: @fusion(i8* align 64 dereferenceable(600) %alloc0, i8* align 16 dereferenceable(400) %alloc1, i8* align 64 dereferenceable(864) %temp_buf)
+)");
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4b8415fe9106137e588f345a3492f93e46aeb5b6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.cc
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"
+#include "tensorflow/compiler/xla/tests/filecheck.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace xla {
+namespace gpu {
+
+std::unique_ptr<HloModule> GpuCodegenTest::CreateNewModuleWithFTZ(bool ftz) {
+  HloModuleConfig config;
+  auto debug_options = legacy_flags::GetDebugOptionsFromFlags();
+  debug_options.set_xla_gpu_ftz(ftz);
+  debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
+  // TODO(b/38354253): Change tests to use Parameters instead of Constants.
+  debug_options.add_xla_disable_hlo_passes("constant_folding");
+  config.set_debug_options(debug_options);
+
+  return MakeUnique<HloModule>(TestName(), config);
+}
+
+void GpuCodegenTest::CompileAndVerifyPtx(std::unique_ptr<HloModule> hlo_module,
+                                         const string& pattern) {
+  std::unique_ptr<Executable> executable =
+      std::move(CompileToExecutable(std::move(hlo_module)).ValueOrDie());
+  string ptx_str =
+      std::string(static_cast<GpuExecutable*>(executable.get())->ptx());
+  StatusOr<bool> filecheck_result = RunFileCheck(ptx_str, pattern);
+  ASSERT_TRUE(filecheck_result.ok());
+  EXPECT_TRUE(filecheck_result.ValueOrDie());
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4a3573babb7ed746504c1466f85b582aa4d044f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TESTS_GPU_CODEGEN_TEST_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TESTS_GPU_CODEGEN_TEST_H_
+
+#include <string>
+
+#include "tensorflow/compiler/xla/tests/llvm_irgen_test_base.h"
+
+namespace xla {
+namespace gpu {
+
+// Tests that verify IR or PTX emitted by the GPU backend is as expected.
+class GpuCodegenTest : public LlvmIrGenTestBase {
+ protected:
+  // Like HloTestBase::CreateNewModule(), with a flag for configuring the ftz
+  // option.
+  std::unique_ptr<HloModule> CreateNewModuleWithFTZ(bool ftz);
+
+  // Compiles the given HLO module to PTX and verifies the PTX matches the given
+  // FileCheck pattern.  (See http://llvm.org/docs/CommandGuide/FileCheck.html).
+  void CompileAndVerifyPtx(std::unique_ptr<HloModule> hlo_module,
+                           const string& pattern);
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_TESTS_GPU_CODEGEN_TEST_H_
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ce69e058e64aab1f3c292b2ad7c7b529d4666b35
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+
+class GpuCopyTest : public GpuCodegenTest {};
+
+// The GPU backend should not emit a copy kernel for the kCopy instruction in
+// this test. Instead, it should generate a CopyThunk which invokes cuMemcpy at
+// runtime.
+TEST_F(GpuCopyTest, UseMemcpy) {
+  HloComputation::Builder builder(TestName());
+
+  std::unique_ptr<Literal> literal =
+      LiteralUtil::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}});
+  HloInstruction* constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(std::move(literal)));
+  builder.AddInstruction(HloInstruction::CreateUnary(
+      constant->shape(), HloOpcode::kCopy, constant));
+
+  std::unique_ptr<HloComputation> computation = builder.Build();
+
+  auto hlo_module = CreateNewModule();
+  hlo_module->AddEntryComputation(std::move(computation));
+
+  // There should not be any kernel prefixed "copy".
+  CompileAndVerifyIr(std::move(hlo_module), "; CHECK-NOT: define void @_copy",
+                     /*match_optimized_ir=*/false);
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..177b94934c7f519172508b5cc6e088f908401193
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
@@ -0,0 +1,119 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+
+// Check that the ftz (flush denormals to zero) flag is reflected in PTX as
+// expected.
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuFtzTest : public GpuCodegenTest {
+ public:
+  explicit GpuFtzTest(bool ftz) : ftz_(ftz) {}
+
+  // Creates an HLO module that performs the given binary operation on some
+  // data.
+  std::unique_ptr<HloModule> CreateBinaryOpModule(HloOpcode op) {
+    HloComputation::Builder builder(TestName());
+
+    Shape param_shape = ShapeUtil::MakeShapeWithLayout(
+        F32, /*dimensions=*/{100, 100}, /*minor_to_major=*/{1, 0});
+    HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+        /* parameter_number=*/0, param_shape, "x"));
+    HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter(
+        /* parameter_number=*/1, param_shape, "y"));
+    builder.AddInstruction(HloInstruction::CreateBinary(param_shape, op, x, y));
+
+    auto hlo_module = CreateNewModuleWithFTZ(ftz_);
+    hlo_module->AddEntryComputation(builder.Build());
+    return hlo_module;
+  }
+
+  // Creates an HLO module that performs the given unary operation on some data.
+  std::unique_ptr<HloModule> CreateUnaryOpModule(HloOpcode op) {
+    HloComputation::Builder builder(TestName());
+
+    Shape param_shape = ShapeUtil::MakeShapeWithLayout(
+        F32, /*dimensions=*/{100, 100}, /*minor_to_major=*/{1, 0});
+    HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
+        /* parameter_number=*/0, param_shape, "x"));
+    builder.AddInstruction(HloInstruction::CreateUnary(param_shape, op, x));
+
+    auto hlo_module = CreateNewModuleWithFTZ(ftz_);
+    hlo_module->AddEntryComputation(builder.Build());
+    return hlo_module;
+  }
+
+  bool ftz_;
+};
+
+class GpuFtzEnabledTest : public GpuFtzTest {
+ public:
+  GpuFtzEnabledTest() : GpuFtzTest(/*ftz=*/true) {}
+};
+
+class GpuFtzDisabledTest : public GpuFtzTest {
+ public:
+  GpuFtzDisabledTest() : GpuFtzTest(/*ftz=*/false) {}
+};
+
+// Check that we emit mul.ftz.f32 when in ftz mode, and plain mul.f32 otherwise.
+TEST_F(GpuFtzEnabledTest, MultiplyFtz) {
+  CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
+    CHECK-NOT: mul.f32
+    CHECK: mul.ftz.f32
+    CHECK-NOT: mul.f32
+  )");
+}
+TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
+  CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
+    CHECK-NOT: mul.ftz.f32
+    CHECK: mul.f32
+    CHECK-NOT: mul.ftz.f32
+  )");
+}
+
+// In NVPTX, exp(float) is implemented in libdevice, and consults __nvvm_reflect
+// to determine whether or not ftz is enabled.  The implementation uses two
+// calls to ex2.approx.  When ftz is on, we get two calls to the ftz version;
+// when ftz is off, we get one call to the ftz version and one call to the
+// regular version.
+TEST_F(GpuFtzEnabledTest, ExpFtz) {
+  CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
+    CHECK-NOT: ex2.approx.f32
+    CHECK:     ex2.approx.ftz.f32
+    CHECK-NOT: ex2.approx.f32
+    CHECK:     ex2.approx.ftz.f32
+    CHECK-NOT: ex2.approx.f32
+    CHECK-NOT: ex2.approx.ftz.f32
+  )");
+}
+
+TEST_F(GpuFtzDisabledTest, ExpFtz) {
+  CompileAndVerifyPtx(CreateUnaryOpModule(HloOpcode::kExp), R"(
+    CHECK-NOT: ex2.approx.f32
+    CHECK-DAG: ex2.approx.ftz.f32
+    CHECK-DAG: ex2.approx.f32
+    CHECK-NOT: ex2.approx.f32
+    CHECK-NOT: ex2.approx.ftz.f32
+  )");
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..674b436a8e3135a5dfe3731647897696bf1321cd
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuFusionTest : public GpuCodegenTest {};
+
+TEST_F(GpuFusionTest, FusedReshape) {
+  const char* hlo_text = R"(
+    HloModule test_module
+
+    fused_computation {
+      p0.param_0 = f32[4,1,1]{2,1,0} parameter(0)
+      p1.param_1 = f32[4,1]{1,0} parameter(1)
+      reshape = f32[4,1]{1,0} reshape(p0.param_0)
+      ROOT add = f32[4,1] add(reshape, p1.param_1)
+    }
+
+    ENTRY BroadcastIntoAdd {
+      p0 = f32[4,1,1]{2,1,0} parameter(0)
+      p1 = f32[4,1]{1,0} parameter(1)
+      ROOT fusion = f32[4,1]{1,0} fusion(p0, p1), kind=kLoop,
+                                                  calls=fused_computation
+    }
+)";
+
+  CompileAndVerifyIr(hlo_text,
+                     R"(
+; CHECK-LABEL: @fusion
+; CHECK: fadd
+; CHECK: }
+      )");
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e5958165eff21d82faf821213e50fe30a11059a4
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
@@ -0,0 +1,147 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla.pb.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+
+// This file tests the index expressions used to reference source tensors. When
+// the destination tensor and source tensor have compatible shapes, the linear
+// index is used to access the source tensor. Otherwise, dimensional indices
+// computed from the linear index are used to access the source tensor.
+
+class GpuIndexTest : public GpuCodegenTest {};
+
+TEST_F(GpuIndexTest, CompatibleUseLinearIndex) {
+  HloComputation::Builder builder(TestName());
+
+  auto param_shape = ShapeUtil::MakeShape(F32, {5, 7, 2});
+  HloInstruction* param_x = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, param_shape, "x"));
+  HloInstruction* param_y = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, param_shape, "y"));
+  builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(PRED, {5, 7, 2}), HloOpcode::kGe, param_x, param_y));
+
+  auto hlo_module = CreateNewModule();
+  hlo_module->AddEntryComputation(builder.Build());
+
+  // Check the optimized IR as the unoptimized IR contains dead udiv and urem.
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-NOT: udiv
+; CHECK-NOT: urem
+      )",
+                     /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshape) {
+  HloModuleConfig config;
+  config.set_debug_options(HloTestBase::GetDebugOptionsForTest());
+  auto module = ParseHloString(R"(
+    HloModule test_module
+
+    ENTRY CompatibleUseLinearIndexWithReshape {
+      x = f32[5,7,2]{2,1,0} parameter(0)
+      y = f32[5,14]{1,0} parameter(1)
+      reshape = f32[5,7,2]{2,1,0} reshape(y)
+      ROOT gte = pred[5,7,2]{2,1,0} greater-than-or-equal-to(x, reshape)
+    })",
+                               config)
+                    .ValueOrDie();
+
+  // Check the optimized IR as the unoptimized IR contains dead udiv and urem.
+  CompileAndVerifyIr(std::move(module),
+                     R"(
+; CHECK-NOT: udiv
+; CHECK-NOT: urem
+      )",
+                     /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithReshapeAndBroadcast) {
+  HloModuleConfig config;
+  config.set_debug_options(HloTestBase::GetDebugOptionsForTest());
+  auto module = ParseHloString(R"(
+    HloModule test_module
+
+    ENTRY CompatibleUseLinearIndexWithReshape {
+      x = f32[5,7,2]{2,1,0} parameter(0)
+      y = f32[14]{0} parameter(1)
+      reshape = f32[7,2]{1,0} reshape(y)
+      broadcast = f32[5,7,2]{2,1,0} broadcast(reshape), dimensions={1,2}
+      ROOT gte = pred[5,7,2]{2,1,0} greater-than-or-equal-to(x, broadcast)
+    })",
+                               config)
+                    .ValueOrDie();
+
+  // Check the optimized IR reuses the linear index by calculating modulo 14.
+  CompileAndVerifyIr(std::move(module),
+                     R"(
+; CHECK: %[[urem1:.*]] = urem i{{[0-9]*}} %[[linear_index:.*]], 14
+; CHECK: %[[bitcast:.*]] = bitcast i8 addrspace(1)* %[[alloc:.*]] to float addrspace(1)*
+; CHECK: %[[idx1:.*]] = zext i{{[0-9]*}} %[[urem1]] to i64
+; CHECK: getelementptr inbounds float, float addrspace(1)* %[[bitcast]], i64 %[[idx1]]
+      )",
+                     /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuIndexTest, CompatibleUseLinearIndexWithSizeOneDimensions) {
+  HloModuleConfig config;
+  auto debug_options = HloTestBase::GetDebugOptionsForTest();
+  debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
+  config.set_debug_options(debug_options);
+
+  auto module = ParseHloString(R"(
+    HloModule  test_module
+
+    ENTRY CompatibleUseLinearIndexWithSizeOneDimensions  {
+      x = f32[1,1024,1,256]{3,2,1,0} parameter(0)
+      ROOT y = f16[1,1024,1,256]{2,3,1,0} convert(x)
+    })",
+                               config)
+                    .ValueOrDie();
+
+  // Check that the unoptimized IR reuses the linear index.
+  CompileAndVerifyIr(std::move(module),
+                     R"(
+; CHECK-LABEL: @fusion
+; CHECK: udiv i32 %[[linear_index:.*]], 262144
+; CHECK: %[[ld_addr:.*]] = getelementptr inbounds float, float* {{.*}}, i32 %[[linear_index]]
+; CHECK: load float, float* %[[ld_addr]]
+; CHECK: %[[st_addr:.*]] = getelementptr inbounds half, half* {{.*}}, i32 %[[linear_index]]
+; CHECK: store half {{.*}}, half* %[[st_addr]]
+      )",
+                     /*match_optimized_ir=*/false);
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cca35316f0c472d2a17c466f8cd1af7f22575a8b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc
@@ -0,0 +1,177 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuKernelTilingTest : public GpuCodegenTest {
+ protected:
+  GpuKernelTilingTest() {
+    auto debug_options = HloTestBase::GetDebugOptionsForTest();
+    config_.set_debug_options(debug_options);
+    // Disable layout_assignment to use the preassigned layouts.
+    debug_options.add_xla_disable_hlo_passes("layout_assignment");
+  }
+  HloModuleConfig config_;
+};
+
+TEST_F(GpuKernelTilingTest, UnnestedTransposeWithProperDimensionsTiled) {
+  const char *const kHloString = R"(
+    HloModule unnested_transpose_1
+
+    ENTRY unnested_transpose_1 {
+      para0 = f16[32,3,64]{2,1,0} parameter(0)
+      ROOT copy1 = f16[32,3,64]{1,0,2} copy(para0)
+    })";
+
+  // Check that a call to llvm.nvvm.barrier0 is generated.
+  auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @copy
+; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+TEST_F(GpuKernelTilingTest, UnnestedTransposeWithSmallDimensionsNotTiled) {
+  const char *const kHloString = R"(
+    HloModule unnested_transpose_2
+
+    ENTRY unnested_transpose_2 {
+      para0 = f16[2,3,64]{2,1,0} parameter(0)
+      ROOT copy1 = f16[2,3,64]{1,0,2} copy(para0)
+    })";
+
+  // Check that a call to llvm.nvvm.barrier0 is not generated.
+  auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @copy
+; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuKernelTilingTest, SimpleFusionWithTransposeTiled) {
+  const char *const kHloString = R"(
+    HloModule multiple_output_fusion_1
+    fused_computation.1 {
+      param0 = f32[4,5,6,7,8]{4,3,2,1,0} parameter(0)
+      copy = f32[4,5,6,7,8]{2,1,4,3,0} copy(param0)
+      ROOT convert = f16[4,5,6,7,8]{2,1,4,3,0} convert(copy)
+    }
+
+    ENTRY copy_in_fusion_run_without_hlo_passes {
+      para0 = f32[4,5,6,7,8]{4,3,2,1,0} parameter(0)
+      ROOT fusion.1 = f16[4,5,6,7,8]{2,1,4,3,0} fusion(para0), kind=kLoop,
+        calls=fused_computation.1
+    })";
+
+  // Check that a call to llvm.nvvm.barrier0 is generated.
+  auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+TEST_F(GpuKernelTilingTest, MultipleOutputFusionWithOnePossibleTransposeTiled) {
+  const char *const kHloString = R"(
+    HloModule multiple_output_fusion_1
+    fused_computation.1 {
+      param0 = f16[8,31,31,65]{3,2,1,0} parameter(0)
+      param1 = f16[8,31,31,65]{3,2,1,0} parameter(1)
+      copy0 = f16[8,31,31,65]{2,1,3,0} copy(param0)
+      copy1 = f16[8,31,31,65]{2,1,3,0} copy(param1)
+      ROOT tuple1 = (f16[8,31,31,65]{2,1,3,0}, f16[8,31,31,65]{2,1,3,0})
+        tuple(copy0, copy1)
+    }
+
+    ENTRY multiple_output_fusion_1 {
+      para0 = f16[8,31,31,65]{3,2,1,0} parameter(0)
+      para1 = f16[8,31,31,65]{3,2,1,0} parameter(1)
+      ROOT fusion.1 = (f16[8,31,31,65]{2,1,3,0}, f16[8,31,31,65]{2,1,3,0})
+        fusion(para0,para1), kind=kLoop, calls=fused_computation.1
+    })";
+
+  // Check that a call to llvm.nvvm.barrier0 is generated.
+  auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+
+  // Check that the kernel runs correctly.
+  EXPECT_TRUE(RunAndCompareNoHloPasses(kHloString, ErrorSpec{0.0}));
+}
+
+TEST_F(GpuKernelTilingTest,
+       MultipleOutputFusionWithTwoPossibleTransposesNotTiled) {
+  const char *const kHloString = R"(
+    HloModule multiple_output_fusion_2
+    fused_computation.1 {
+      param0 = f16[8,31,31,65]{3,2,1,0} parameter(0)
+      param1 = f16[8,31,31,65]{1,3,2,0} parameter(1)
+      copy2 = f16[8,31,31,65]{2,1,3,0} copy(param0)
+      copy3 = f16[8,31,31,65]{2,1,3,0} copy(param1)
+      ROOT tuple1 = (f16[8,31,31,65]{2,1,3,0}, f16[8,31,31,65]{2,1,3,0})
+        tuple(copy2, copy3)
+    }
+
+    ENTRY multiple_output_fusion_2 {
+      para0 = f16[8,31,31,65]{3,2,1,0} parameter(0)
+      para1 = f16[8,31,31,65]{1,3,2,0} parameter(1)
+      ROOT fusion1 = (f16[8,31,31,65]{2,1,3,0}, f16[8,31,31,65]{2,1,3,0})
+        fusion(para0,para1), kind=kLoop, calls=fused_computation.1
+    })";
+
+  // Check that a call to llvm.nvvm.barrier0 is not generated.
+  auto hlo_module = ParseHloString(kHloString, config_).ValueOrDie();
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: define void @fusion
+; CHECK-NOT: tail call void @llvm.nvvm.barrier0()
+; CHECK: }
+)",
+                     /*match_optimized_ir=*/true);
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c9ae7bada5e7545b558b6fcb872ece60850cbe9
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Tests that we emit ld.global.nc (the PTX instruction corresponding to CUDA's
+// __ldg builtin) for reads of buffers that don't change during a kernel's
+// execution.
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+
+class GpuLdgTest : public GpuCodegenTest {};
+
+// Parameters are never overwritten, so parameter reads should get ld.global.nc
+// reads.
+TEST_F(GpuLdgTest, LdgForParamRead) {
+  HloComputation::Builder builder(TestName());
+
+  auto shape = ShapeUtil::MakeShape(F32, {2, 2});
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
+  builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param));
+  std::unique_ptr<HloComputation> computation = builder.Build();
+
+  auto hlo_module = CreateNewModule();
+  hlo_module->AddEntryComputation(std::move(computation));
+
+  CompileAndVerifyPtx(std::move(hlo_module), R"(
+    CHECK-NOT: ld.global.f32
+    CHECK: ld.global.nc.f32
+  )");
+}
+
+// Check that reading a buffer produced by a non-parameter HLO also results in
+// ld.global.nc, if that buffer isn't modified within the instruction that reads
+// it.
+TEST_F(GpuLdgTest, LdgForNonParamRead) {
+  HloComputation::Builder builder(TestName());
+
+  auto shape = ShapeUtil::MakeShape(F32, {2, 2});
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
+  HloInstruction* add = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param));
+  HloInstruction* square = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, add, add));
+  builder.AddInstruction(HloInstruction::CreateTuple({add, square}));
+  std::unique_ptr<HloComputation> computation = builder.Build();
+
+  auto hlo_module = CreateNewModule();
+  hlo_module->AddEntryComputation(std::move(computation));
+
+  CompileAndVerifyPtx(std::move(hlo_module), R"(
+    CHECK: {
+    CHECK-NOT: ld.global.f32
+    CHECK: ld.global.nc.f32
+    CHECK: }
+  )");
+}
+
+// Check that reading a buffer that's modified in-place does not produce
+// ld.global.nc.
+//
+// We do this by creating a reduce that feeds into a sin.  We don't currently
+// fuse sin into reduce, and the sin is elementwise, so it reuses its input
+// buffer as its output.
+//
+// It seems like a fair bet that we won't start fusing sin into the output of
+// reduce in the foreseeable future.  But if that turns out to be wrong, I give
+// you, future reader, permission to delete this test.
+TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) {
+  auto hlo_module = CreateNewModule();
+  HloComputation::Builder builder(TestName());
+
+  HloComputation* reduce_computation;
+  {
+    auto embedded_builder = HloComputation::Builder("add");
+    auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeShape(F32, {}), "lhs"));
+    auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        1, ShapeUtil::MakeShape(F32, {}), "rhs"));
+    embedded_builder.AddInstruction(
+        HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
+    reduce_computation =
+        hlo_module->AddEmbeddedComputation(embedded_builder.Build());
+  }
+
+  auto param_shape = ShapeUtil::MakeShape(F32, {2, 2});
+  auto reduce_shape = ShapeUtil::MakeShape(F32, {2});
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, param_shape, "x"));
+  HloInstruction* reduce = builder.AddInstruction(HloInstruction::CreateReduce(
+      reduce_shape,
+      builder.AddInstruction(HloInstruction::CreateBinary(
+          param_shape, HloOpcode::kAdd, param, param)),
+      builder.AddInstruction(
+          HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0))),
+      {0}, reduce_computation));
+  builder.AddInstruction(
+      HloInstruction::CreateUnary(reduce_shape, HloOpcode::kSin, reduce));
+
+  std::unique_ptr<HloComputation> computation = builder.Build();
+  hlo_module->AddEntryComputation(std::move(computation));
+
+  CompileAndVerifyPtx(std::move(hlo_module), R"(
+    CHECK-LABEL: .entry sin
+    CHECK: {
+    CHECK-NOT: ld.global.nc.f32
+    CHECK: ld.global.f32
+    CHECK: }
+  )");
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c42e5704a4d2e611a203293e60a86ba4104bca46
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
@@ -0,0 +1,68 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+#include <utility>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+
+class GpuNoAliasTest : public GpuCodegenTest {};
+
+TEST_F(GpuNoAliasTest, Concat) {
+  HloComputation::Builder builder(TestName());
+
+  auto param_shape = ShapeUtil::MakeShape(F32, {2, 2});
+  HloInstruction* param_x = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, param_shape, "x"));
+  HloInstruction* param_y = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, param_shape, "y"));
+  HloInstruction* concat =
+      builder.AddInstruction(HloInstruction::CreateConcatenate(
+          ShapeUtil::MakeShape(F32, {2, 4}), {param_x, param_y}, 1));
+  builder.AddInstruction(HloInstruction::CreateConcatenate(
+      ShapeUtil::MakeShape(F32, {2, 6}), {concat, param_x}, 1));
+
+  std::unique_ptr<HloComputation> computation = builder.Build();
+
+  auto hlo_module = CreateNewModule();
+  hlo_module->AddEntryComputation(std::move(computation));
+
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK: %[[x_gep:.*]] = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %x{{.*}}, i32 0
+; CHECK: load float, float* %[[x_gep]], {{.*}}, !noalias ![[param_noalias:.*]]
+; CHECK: %[[y_gep:.*]] = getelementptr inbounds [2 x [2 x float]], [2 x [2 x float]]* %y{{.*}}, i32 0
+; CHECK: load float, float* %[[y_gep]], {{.*}}, !noalias ![[param_noalias]]
+; CHECK: %[[result_ptr:.*]] = bitcast [2 x [6 x float]]* %fusion{{.*}} to float*
+; CHECK: %[[result_gep:.*]] = getelementptr inbounds float, float* %[[result_ptr]]
+; CHECK: store float {{.*}}, float* %[[result_gep]], !alias.scope ![[param_noalias]]
+; CHECK: ![[param_noalias]] = !{![[retval_buffer:.*]]}
+      )",
+                     /*match_optimized_ir=*/false);
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..962293630683fcbbce3941f622061a2ff0f02dda
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc
@@ -0,0 +1,185 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h"
+#include "tensorflow/compiler/xla/service/hlo_module_config.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace gpu {
+namespace {
+
+class GpuUnrollingTest : public GpuCodegenTest {};
+
+const char *const kAddModule = R"(
+    HloModule test_module
+
+    fused_computation {
+      p0.param_0 = f32[2,2]{1,0} parameter(0)
+      p1.param_1 = f32[2,2]{1,0} parameter(1)
+      ROOT add = f32[2,2] add(p0.param_0, p1.param_1)
+    }
+
+    ENTRY BroadcastIntoAdd {
+      p0 = f32[2,2]{1,0} parameter(0)
+      p1 = f32[2,2]{1,0} parameter(1)
+      ROOT fusion = f32[2,2]{1,0} fusion(p0, p1), kind=kLoop,
+                                                  calls=fused_computation
+    })";
+
+TEST_F(GpuUnrollingTest, DoNotUnroll) {
+  HloModuleConfig config;
+  auto debug_options = HloTestBase::GetDebugOptionsForTest();
+  debug_options.set_xla_gpu_max_kernel_unroll_factor(1);
+  config.set_debug_options(debug_options);
+  auto hlo_module = ParseHloString(kAddModule, config).ValueOrDie();
+
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: @fusion
+; CHECK: fadd
+; CHECK-NOT: fadd
+; CHECK: }
+      )",
+                     /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuUnrollingTest, UnrollFourTimes) {
+  HloModuleConfig config;
+  auto debug_options = HloTestBase::GetDebugOptionsForTest();
+  // We request a factor of 8, but the computation works on 4 elements, limiting
+  // the maximum unroll factor.
+  debug_options.set_xla_gpu_max_kernel_unroll_factor(8);
+  config.set_debug_options(debug_options);
+  auto hlo_module = ParseHloString(kAddModule, config).ValueOrDie();
+
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: @fusion
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK-NOT: fadd
+; CHECK: }
+      )",
+                     /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuUnrollingTest, UnrollDefaultTimes) {
+  // The default unrolling factor is 4.
+  HloModuleConfig config;
+  config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+  auto hlo_module = ParseHloString(kAddModule, config).ValueOrDie();
+
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: @fusion
+; CHECK: load <4 x float>
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK-NOT: fadd
+; CHECK: store <4 x float>
+; CHECK: }
+      )",
+                     /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuUnrollingTest, UnrollUnfusedAdd) {
+  HloModuleConfig config;
+  auto debug_options = HloTestBase::GetDebugOptionsForTest();
+  debug_options.set_xla_gpu_max_kernel_unroll_factor(4);
+  config.set_debug_options(debug_options);
+
+  const char *const kUnfusedAddModule = R"(
+    HloModule test_module
+
+    ENTRY AddFunc {
+      p0 = f32[2,2]{1,0} parameter(0)
+      p1 = f32[2,2]{1,0} parameter(1)
+      ROOT add = f32[2,2]{1,0} add(p0, p1)
+    })";
+  auto hlo_module = ParseHloString(kUnfusedAddModule, config).ValueOrDie();
+
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: @add
+; CHECK: load <4 x float>
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK: fadd
+; CHECK-NOT: fadd
+; CHECK: store <4 x float>
+; CHECK: }
+      )",
+                     /*match_optimized_ir=*/true);
+}
+
+TEST_F(GpuUnrollingTest, UnrollMultiOutputFusion) {
+  HloModuleConfig config;
+  auto debug_options = HloTestBase::GetDebugOptionsForTest();
+  debug_options.set_xla_gpu_max_kernel_unroll_factor(2);
+  config.set_debug_options(debug_options);
+
+  const char *const kMultiOutputFusionModule = R"(
+    HloModule test_module
+
+    fused_computation {
+      p0.param_0 = f32[2,2]{1,0} parameter(0)
+      p1.param_1 = f32[2,2]{1,0} parameter(1)
+      add = f32[2,2]{1,0} add(p0.param_0, p1.param_1)
+      mul = f32[2,2]{1,0} multiply(p0.param_0, p1.param_1)
+      ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(add, mul)
+    }
+
+    ENTRY BroadcastIntoAdd {
+      p0 = f32[2,2]{1,0} parameter(0)
+      p1 = f32[2,2]{1,0} parameter(1)
+      ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p0, p1), kind=kLoop,
+                                                   calls=fused_computation
+    })";
+  auto hlo_module =
+      ParseHloString(kMultiOutputFusionModule, config).ValueOrDie();
+
+  CompileAndVerifyIr(std::move(hlo_module),
+                     R"(
+; CHECK-LABEL: @fusion
+; CHECK: load <2 x float>
+; CHECK: load <2 x float>
+; CHECK-NOT: load <2 x float>
+; CHECK: fadd
+; CHECK: fmul
+; CHECK: fadd
+; CHECK: fmul
+; CHECK: store <2 x float>
+; CHECK: store <2 x float>
+; CHECK-NOT: store <2 x float>
+; CHECK-NOT: fadd
+; CHECK-NOT: fmul
+; CHECK: }
+      )",
+                     /*match_optimized_ir=*/true);
+}
+
+}  // namespace
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc b/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9072b30317d253fd6d50e9d98949cad4eaebfe7b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/tests/infeed_test.cc
@@ -0,0 +1,121 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <unistd.h>
+#include <memory>
+
+#include "tensorflow/compiler/xla/client/global_data.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/math/math_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+class InfeedTest : public ClientLibraryTestBase {
+ protected:
+  // Transfers the given literal to the infeed interface of the device, and
+  // check if the returned data from Infeed HLO is same as the literal.
+  void TestInfeedRoundTrip(const Literal& literal) {
+    // TODO(b/30481585) Explicitly reset the Infeed state so that the
+    // test is not affected by the state from the previous tests.
+    ASSERT_IS_OK(client_->TransferToInfeed(literal));
+    XlaBuilder builder(TestName());
+    Infeed(&builder, literal.shape());
+    if (ShapeUtil::IsTuple(literal.shape())) {
+      // TODO(b/30609564): Use ComputeAndCompareLiteral instead.
+      ComputeAndCompareTuple(&builder, literal, {});
+    } else {
+      ComputeAndCompareLiteral(&builder, literal, {});
+    }
+  }
+};
+
+TEST_F(InfeedTest, SingleInfeedR0Bool) {
+  TestInfeedRoundTrip(*LiteralUtil::CreateR0<bool>(true));
+}
+
+TEST_F(InfeedTest, SingleInfeedR1U32) {
+  TestInfeedRoundTrip(*LiteralUtil::CreateR1<uint32>({1, 2, 3}));
+}
+
+TEST_F(InfeedTest, SingleInfeedR2F32) {
+  TestInfeedRoundTrip(*LiteralUtil::CreateR2F32Linspace(0.0, 1.0, 128, 64));
+}
+
+TEST_F(InfeedTest, SingleInfeedR3F32) {
+  TestInfeedRoundTrip(
+      *LiteralUtil::CreateR3({{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}},
+                              {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}}));
+}
+
+TEST_F(InfeedTest, SingleInfeedR3F32DifferentLayout) {
+  const Layout r3_dim0minor = LayoutUtil::MakeLayout({0, 1, 2});
+  const Layout r3_dim0major = LayoutUtil::MakeLayout({2, 1, 0});
+
+  TestInfeedRoundTrip(*LiteralUtil::CreateR3WithLayout(
+      {{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}},
+       {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}},
+      r3_dim0minor));
+
+  TestInfeedRoundTrip(*LiteralUtil::CreateR3WithLayout(
+      {{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}},
+       {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}},
+      r3_dim0major));
+}
+
+TEST_F(InfeedTest, SingleInfeedR4S32) {
+  TestInfeedRoundTrip(*LiteralUtil::CreateR4(
+      {{{{1, -2}, {-4, 5}, {6, 7}}, {{8, 9}, {10, 11}, {12, 13}}},
+       {{{10, 3}, {7, -2}, {3, 6}}, {{2, 5}, {-11, 5}, {-2, -5}}}}));
+}
+
+// Tests that a large infeed can be handled.
+TEST_F(InfeedTest, LargeInfeed) {
+  Array4D<float> array(80, 100, 8, 128);
+  array.FillIota(1.0f);
+  TestInfeedRoundTrip(*LiteralUtil::CreateR4FromArray4D<float>(array));
+}
+
+TEST_F(InfeedTest, SingleInfeedTuple) {
+  TestInfeedRoundTrip(
+      *LiteralUtil::MakeTuple({LiteralUtil::CreateR1<uint32>({1, 2, 3}).get(),
+                               LiteralUtil::CreateR0<bool>(false).get()}));
+}
+
+TEST_F(InfeedTest, SingleInfeedEmptyTuple) {
+  TestInfeedRoundTrip(*LiteralUtil::MakeTuple({}));
+}
+
+// Tests that a large tuple infeed can be handled.
+TEST_F(InfeedTest, SingleInfeedLargeTuple) {
+  Array4D<float> array(40, 100, 8, 128);
+  array.FillIota(1.0f);
+  TestInfeedRoundTrip(*LiteralUtil::MakeTuple(
+      {LiteralUtil::CreateR4FromArray4D<float>(array).get(),
+       LiteralUtil::CreateR0<int32>(5).get()}));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.cc b/tensorflow/compiler/xla/service/gpu/thunk.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c78605cebbc671272b8df9faf0e0cc54be2f5b1c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/thunk.cc
@@ -0,0 +1,59 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/thunk.h"
+
+namespace xla {
+namespace gpu {
+
+std::ostream& operator<<(std::ostream& os, Thunk::Kind kind) {
+  switch (kind) {
+    case Thunk::kConditional:
+      return os << "kConditional";
+    case Thunk::kConvolution:
+      return os << "kConvolution";
+    case Thunk::kCopy:
+      return os << "kCopy";
+    case Thunk::kCudnnBatchNormBackward:
+      return os << "kCudnnBatchNormBackward";
+    case Thunk::kCudnnBatchNormForwardInference:
+      return os << "kCudnnBatchNormForwardInference";
+    case Thunk::kCudnnBatchNormForwardTraining:
+      return os << "kCudnnBatchNormForwardTraining";
+    case Thunk::kFft:
+      return os << "kFft";
+    case Thunk::kGemm:
+      return os << "kGemm";
+    case Thunk::kInfeed:
+      return os << "kInfeed";
+    case Thunk::kKernel:
+      return os << "kKernel";
+    case Thunk::kMemset32BitValue:
+      return os << "kMemset32BitValue";
+    case Thunk::kMemzero:
+      return os << "kMemzero";
+    case Thunk::kOutfeed:
+      return os << "kOutfeed";
+    case Thunk::kSequential:
+      return os << "kSequential";
+    case Thunk::kTuple:
+      return os << "kTuple";
+    case Thunk::kWhile:
+      return os << "kWhile";
+  }
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h
index 99a1a0eae9b60bca8ae8443c98a19fc62d4dddfd..4df0bb005b623e5ac79a4dfcb7c5a8a7a400940c 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk.h
@@ -41,7 +41,7 @@ class GpuExecutable;
 // This is thread-compatible.
 class Thunk {
  public:
-  enum class Kind {
+  enum Kind {
     kConditional,
     kConvolution,
     kCopy,
@@ -111,6 +111,8 @@ class Thunk {
 // A sequence of thunks.
 using ThunkSequence = std::vector<std::unique_ptr<Thunk>>;
 
+std::ostream& operator<<(std::ostream& os, Thunk::Kind kind);
+
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
index 1315a4183a98d6ea9ed4c82d4c22e77c2109ec83..d81d87e7dc54cd752000b85f3ec173d66d7195e4 100644
--- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc
@@ -57,6 +57,7 @@ Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
   while (true) {
     // Invoke thunk sequence for while 'condition' computation.
     profiler->StartHloComputation();
+    VLOG(3) << "Executing condition computation";
     TF_RETURN_IF_ERROR(condition_thunk_sequence_->ExecuteOnStream(
         buffer_allocations, stream, profiler));
     profiler->FinishHloComputation(hlo_instruction()->while_condition());
@@ -64,6 +65,7 @@ Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
     // Copy the result of condition computation and break the loop if 'false'.
     bool condition_result;
     stream->ThenMemcpy(&condition_result, condition_result_data, sizeof(bool));
+    VLOG(3) << "condition_result = " << condition_result;
     Status block_status = stream->BlockHostUntilDone();
     if (!block_status.ok()) {
       return InternalError(
@@ -78,6 +80,7 @@ Status WhileThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations,
     // We measure the time of one execution of the while body computation. The
     // while body may be executed more than once, the last measurement "wins".
     profiler->StartHloComputation();
+    VLOG(3) << "Executing body computation";
     // Invoke thunk sequence for while 'body' computation, and pass on
     // 'profiler' to measure the timing of the thunks in 'body_thunk_sequence_'.
     TF_RETURN_IF_ERROR(body_thunk_sequence_->ExecuteOnStream(buffer_allocations,
diff --git a/tensorflow/compiler/xla/service/gpu/xfeed_queue.h b/tensorflow/compiler/xla/service/gpu/xfeed_queue.h
index 737c7eb02532dd6e9385c58684e46e7aa0a424fb..dd46ff433ba0ad6bfa3999b96845fdaebe148aca 100644
--- a/tensorflow/compiler/xla/service/gpu/xfeed_queue.h
+++ b/tensorflow/compiler/xla/service/gpu/xfeed_queue.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_XFEED_QUEUE_H_
 
 #include <deque>
+#include <functional>
 #include <vector>
 
 #include "tensorflow/core/platform/mutex.h"
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index 87abc0e74f6ca06fb4245ee6f83471fdc708c876..63a8a813cddf304e60fa9b4bbf709eca2d7c2cae 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -244,8 +244,9 @@ message BufferAllocationProto {
   int64 index = 1;
   int64 size = 2;
   bool is_thread_local = 3;
-  bool is_reusable = 4;
+  bool is_tuple = 11;
   bool is_entry_computation_parameter = 5;
+  bool is_constant = 12;
   int64 parameter_number = 6;
   repeated int64 parameter_shape_index = 10;
   bool maybe_live_out = 7;
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.h b/tensorflow/compiler/xla/service/hlo_alias_analysis.h
index afb0c20f0cdf3eb92f72ab8bc368b4b8d723459e..1fea544730c27efdaa260f55ea81c163165f7ed5 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.h
@@ -42,7 +42,7 @@ class HloAliasAnalysis {
   static StatusOr<std::unique_ptr<HloAliasAnalysis>> Run(
       HloModule* module,
       const HloDataflowAnalysis::FusionCanShareBufferFunction&
-          fusion_can_share_buffer = nullptr);
+          fusion_can_share_buffer);
 
   string ToString() const;
 
diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
index 403d4df6b502e4cceaf5b7341278590b3973153f..da94ab5346e5628b4a603b3ac2d84071904d1e65 100644
--- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc
@@ -47,7 +47,9 @@ class HloAliasAnalysisTest : public HloTestBase {
   // reference to the generated analysis stored in analysis_.
   HloAliasAnalysis& RunAnalysis() {
     hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before alias analysis");
-    analysis_ = HloAliasAnalysis::Run(module_.get()).ConsumeValueOrDie();
+    analysis_ = HloAliasAnalysis::Run(module_.get(),
+                                      /*fusion_can_share_buffer=*/nullptr)
+                    .ConsumeValueOrDie();
     return *analysis_;
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index c49cf7f5db5ee9100718fbcd87dc5bdcc175ae5f..1f672502f72f9c658b681383e858995f6e94d2c7 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -155,6 +155,10 @@ Status HloCostAnalysis::HandleConstant(const HloInstruction*) {
   return Status::OK();
 }
 
+Status HloCostAnalysis::HandleIota(const HloInstruction*) {
+  return Status::OK();
+}
+
 Status HloCostAnalysis::HandleGetTupleElement(const HloInstruction*) {
   // GetTupleElement forwards a pointer and does not touch each element in the
   // output.
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index 0181138a6dc554438957e8545c66a98d32dd68d5..82d650dc7b2a7fdd7c156d5fadcabd40f5535161 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -52,6 +52,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   Status HandleElementwiseUnary(const HloInstruction* hlo) override;
   Status HandleElementwiseBinary(const HloInstruction* hlo) override;
   Status HandleConstant(const HloInstruction* constant) override;
+  Status HandleIota(const HloInstruction* iota) override;
   Status HandleGetTupleElement(
       const HloInstruction* get_tuple_element) override;
   Status HandleSelect(const HloInstruction* hlo) override;
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index 9fd0363f578f562aa30bd3dadfea2c251a722af8..2c854eea18642eb7cb081b4fdfe3bc83627e41ae 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -22,8 +22,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/service/service.h"
diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc
index 76b9c66651018089b6ca55b04e12df7c19ebbfb9..90fbaa37c5a70a78a9a818b4a8968f3406c671b1 100644
--- a/tensorflow/compiler/xla/service/hlo_cse_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc
@@ -239,7 +239,7 @@ TEST_F(HloCseTest, IdenticalInstructions) {
   EXPECT_EQ(5, computation->instruction_count());
   EXPECT_THAT(tuple, op::Tuple(exp1, exp2, exp3));
 
-  HloCSE cse(/*is_layout_sensitive=*/false);
+  HloCSE cse(/*is_layout_sensitive=*/true);
   EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
   EXPECT_EQ(3, computation->instruction_count());
@@ -248,6 +248,183 @@ TEST_F(HloCseTest, IdenticalInstructions) {
   EXPECT_THAT(tuple, op::Tuple(first_operand, first_operand, first_operand));
 }
 
+// Test two identical while loops with same inputs
+TEST_F(HloCseTest, WhileLoopsIdenticalConditionsAndBodiesSameInput) {
+  auto module = ParseHloString(R"(
+    HloModule WhileLoopsIdenticalConditionsAndBodiesSameInput
+
+    %body (param: (f32[], f32[])) -> (f32[], f32[]) {
+      %param = (f32[], f32[]) parameter(0)
+      %get-tuple-element = f32[] get-tuple-element((f32[], f32[]) %param),
+index=0 %get-tuple-element.1 = f32[] get-tuple-element((f32[], f32[]) %param),
+index=1 %add = f32[] add(f32[] %get-tuple-element, f32[] %get-tuple-element.1)
+      ROOT %tuple = (f32[], f32[]) tuple(f32[] %get-tuple-element, f32[] %add)
+    }
+
+    %condition (param.1: (f32[], f32[])) -> pred[] {
+      %param.1 = (f32[], f32[]) parameter(0)
+      ROOT %constant = pred[] constant(false)
+    }
+
+    %condition.1 (param.2: (f32[], f32[])) -> pred[] {
+      %param.2 = (f32[], f32[]) parameter(0)
+      ROOT %constant.1 = pred[] constant(false)
+    }
+
+    ENTRY %WhileLoopsIdenticalConditionsAndBodiesSameInput () -> (f32[], f32[])
+{ %constant.2 = f32[] constant(1) %constant.3 = f32[] constant(2) %tuple.1 =
+(f32[], f32[]) tuple(f32[] %constant.2, f32[] %constant.3) %while = (f32[],
+f32[]) while((f32[], f32[]) %tuple.1), condition=%condition, body=%body ROOT
+%while.1 = (f32[], f32[]) while((f32[], f32[]) %tuple.1),
+condition=%condition.1, body=%body
+    }
+    )")
+                    .ValueOrDie();
+
+  auto computation = module->entry_computation();
+
+  EXPECT_EQ(5, computation->instruction_count());
+  HloCSE cse(true);
+  EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
+  EXPECT_EQ(4, computation->instruction_count());
+}
+
+// Test two while loops with same conditions, same inputs, but different
+// bodies
+TEST_F(HloCseTest, WhileLoopsIdenticalConditionsSameInputAndDifferentBodies) {
+  auto module = ParseHloString(R"(
+    HloModule WhileLoopsIdenticalConditionsSameInputAndDifferentBodies
+
+    %body (param: (f32[], f32[])) -> (f32[], f32[]) {
+      %param = (f32[], f32[]) parameter(0)
+      %get-tuple-element = f32[] get-tuple-element((f32[], f32[]) %param),
+index=0 %get-tuple-element.1 = f32[] get-tuple-element((f32[], f32[]) %param),
+index=1 %add = f32[] add(f32[] %get-tuple-element, f32[] %get-tuple-element.1)
+      ROOT %tuple = (f32[], f32[]) tuple(f32[] %get-tuple-element, f32[] %add)
+    }
+
+    %body2 (param.1: (f32[], f32[])) -> (f32[], f32[]) {
+      %param.1 = (f32[], f32[]) parameter(0)
+      %get-tuple-element.2 = f32[] get-tuple-element((f32[], f32[]) %param.1),
+index=0 %get-tuple-element.3 = f32[] get-tuple-element((f32[], f32[]) %param.1),
+index=1 %sub = f32[] subtract(f32[] %get-tuple-element.2, f32[]
+%get-tuple-element.3) ROOT %tuple.2 = (f32[], f32[]) tuple(f32[]
+%get-tuple-element.2, f32[] %sub)
+    }
+
+    %condition (param.2: (f32[], f32[])) -> pred[] {
+      %param.2 = (f32[], f32[]) parameter(0)
+      ROOT %constant = pred[] constant(false)
+    }
+
+    %condition.1 (param.3: (f32[], f32[])) -> pred[] {
+      %param.3 = (f32[], f32[]) parameter(0)
+      ROOT %constant.1 = pred[] constant(false)
+    }
+
+    ENTRY %WhileLoopsIdenticalConditionsSameInputAndDifferentBodies () ->
+(f32[], f32[]) { %constant.2 = f32[] constant(1) %constant.3 = f32[] constant(2)
+      %tuple.1 = (f32[], f32[]) tuple(f32[] %constant.2, f32[] %constant.3)
+      %while = (f32[], f32[]) while((f32[], f32[]) %tuple.1),
+condition=%condition, body=%body ROOT %while.1 = (f32[], f32[]) while((f32[],
+f32[]) %tuple.1), condition=%condition.1, body=%body2
+    }
+    )")
+                    .ValueOrDie();
+
+  auto computation = module->entry_computation();
+
+  EXPECT_EQ(5, computation->instruction_count());
+  HloCSE cse(true);
+  EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
+  EXPECT_EQ(5, computation->instruction_count());
+}
+
+// Test two identical while loops with different inputs
+TEST_F(HloCseTest, WhileLoopsIdenticalConditionsAndBodiesDifferentInput) {
+  auto module = ParseHloString(R"(
+    HloModule WhileLoopsIdenticalConditionsAndBodiesDifferentInput
+
+    %body (param: (f32[], f32[])) -> (f32[], f32[]) {
+      %param = (f32[], f32[]) parameter(0)
+      %get-tuple-element = f32[] get-tuple-element((f32[], f32[]) %param),
+index=0 %get-tuple-element.1 = f32[] get-tuple-element((f32[], f32[]) %param),
+index=1 %add = f32[] add(f32[] %get-tuple-element, f32[] %get-tuple-element.1)
+      ROOT %tuple = (f32[], f32[]) tuple(f32[] %get-tuple-element, f32[] %add)
+    }
+
+    %condition (param.1: (f32[], f32[])) -> pred[] {
+      %param.1 = (f32[], f32[]) parameter(0)
+      ROOT %constant = pred[] constant(false)
+    }
+
+    %condition.1 (param.2: (f32[], f32[])) -> pred[] {
+      %param.2 = (f32[], f32[]) parameter(0)
+      ROOT %constant.1 = pred[] constant(false)
+    }
+
+    ENTRY %WhileLoopsIdenticalConditionsAndBodiesDifferentInput () -> (f32[],
+f32[]) { %constant.2 = f32[] constant(1) %constant.3 = f32[] constant(2)
+      %tuple.1 = (f32[], f32[]) tuple(f32[] %constant.2, f32[] %constant.3)
+      %while = (f32[], f32[]) while((f32[], f32[]) %tuple.1),
+condition=%condition, body=%body %constant.4 = f32[] constant(1) %constant.5 =
+f32[] constant(2) %tuple.2 = (f32[], f32[]) tuple(f32[] %constant.4, f32[]
+%constant.5) ROOT %while.1 = (f32[], f32[]) while((f32[], f32[]) %tuple.2),
+condition=%condition.1, body=%body
+    }
+
+    )")
+                    .ValueOrDie();
+
+  auto computation = module->entry_computation();
+
+  EXPECT_EQ(8, computation->instruction_count());
+  HloCSE cse(true);
+  EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
+  EXPECT_EQ(8, computation->instruction_count());
+}
+
+// Test two while loops with identical bodies and same inputs, but different
+// conditions
+TEST_F(HloCseTest, WhileLoopsIdenticalBodiesAndInputDifferntConditions) {
+  auto module = ParseHloString(R"(
+    HloModule WhileLoopsIdenticalBodiesAndInputDifferntConditions
+
+    %body (param: (f32[], f32[])) -> (f32[], f32[]) {
+      %param = (f32[], f32[]) parameter(0)
+      %get-tuple-element = f32[] get-tuple-element((f32[], f32[]) %param),
+index=0 %get-tuple-element.1 = f32[] get-tuple-element((f32[], f32[]) %param),
+index=1 %add = f32[] add(f32[] %get-tuple-element, f32[] %get-tuple-element.1)
+      ROOT %tuple = (f32[], f32[]) tuple(f32[] %get-tuple-element, f32[] %add)
+    }
+
+    %condition (param.1: (f32[], f32[])) -> pred[] {
+      %param.1 = (f32[], f32[]) parameter(0)
+      ROOT %constant = pred[] constant(false)
+    }
+
+    %condition.1 (param.2: (f32[], f32[])) -> pred[] {
+      %param.2 = (f32[], f32[]) parameter(0)
+      ROOT %constant.1 = pred[] constant(true)
+    }
+
+    ENTRY %WhileLoopsIdenticalBodiesAndInputDifferntConditions () -> (f32[],
+f32[]) { %constant.2 = f32[] constant(1) %constant.3 = f32[] constant(2)
+      %tuple.1 = (f32[], f32[]) tuple(f32[] %constant.2, f32[] %constant.3)
+      %while = (f32[], f32[]) while((f32[], f32[]) %tuple.1),
+condition=%condition, body=%body ROOT %while.1 = (f32[], f32[]) while((f32[],
+f32[]) %tuple.1), condition=%condition.1, body=%body
+    })")
+                    .ValueOrDie();
+
+  auto computation = module->entry_computation();
+
+  EXPECT_EQ(5, computation->instruction_count());
+  HloCSE cse(true);
+  EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
+  EXPECT_EQ(5, computation->instruction_count());
+}
+
 TEST_F(HloCseTest, IdenticalInstructionsDifferentLayoutsSensitive) {
   // Test that two identical instructions with different layouts are *not*
   // commoned if the pass is layout sensitive.
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index de1a32d8bd9217baabda4ab4b02bf28baebad531..1abfcb77036ba829e44575f9404b8993afbfe1d7 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -1017,19 +1017,17 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
   }
 
   if (user->opcode() == HloOpcode::kFusion) {
+    if (fusion_can_share_buffer_ != nullptr) {
+      return fusion_can_share_buffer_(user, operand);
+    }
     // Get the parameter associated with 'operand';
     HloInstruction* fusion_param =
         user->fused_parameter(user->operand_index(operand));
 
     const HloValue& value = GetValueDefinedAt(fusion_param, operand_index);
-    if (value.uses().size() != 1) {
-      if (MultiDynamicSliceUseShareSameIndices(value.uses())) {
-        return true;
-      }
-      return false;
+    if (MultiDynamicSliceUseShareSameIndices(value.uses())) {
+      return true;
     }
-    const HloUse& use = value.uses()[0];
-
     if (user->fusion_kind() == HloInstruction::FusionKind::kLoop ||
         user->fusion_kind() == HloInstruction::FusionKind::kInput) {
       if (user->fused_expression_root()->opcode() ==
@@ -1039,13 +1037,17 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
         // Returns true iff there is exactly one use of 'operand' at shape index
         // 'operand_index', and this singleton use is the fused root at operand
         // index 0.
-        return use.instruction == user->fused_expression_root() &&
-               use.operand_number == 0;
-      } else {
-        return AreTransitiveUsesElementwiseOrTuple(fusion_param);
+        if (value.uses().size() == 1) {
+          const HloUse& use = value.uses()[0];
+          return use.instruction == user->fused_expression_root() &&
+                 use.operand_number == 0;
+        }
+        return false;
       }
-    } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput &&
-               user->fused_expression_root()->opcode() == HloOpcode::kAdd) {
+      return AreTransitiveUsesElementwiseOrTuple(fusion_param);
+    }
+    if (user->fusion_kind() == HloInstruction::FusionKind::kOutput &&
+        user->fused_expression_root()->opcode() == HloOpcode::kAdd) {
       // Output fusion with kAdd fused root.
 
       // Check if one operand of kAdd fused root is kDot or kConvolution.
@@ -1066,11 +1068,12 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
       // Returns true iff there is exactly one use of 'operand' at shape index
       // 'operand_index', and this singleton use is the fused root (at operand
       // index 'other_add_operand_index').
-      return use.instruction == user->fused_expression_root() &&
-             use.operand_number == other_add_operand_index;
-    } else if (fusion_can_share_buffer_ != nullptr &&
-               fusion_can_share_buffer_(user, operand)) {
-      return true;
+      if (value.uses().size() == 1) {
+        const HloUse& use = value.uses()[0];
+        return use.instruction == user->fused_expression_root() &&
+               use.operand_number == other_add_operand_index;
+      }
+      return false;
     }
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 5f575b24a1fb36c5384592028e0f1f6a8e9404b6..cba72469ce73603f05d9957eb64e8519e8b8aec0 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index c0a8ea8bcbd7dbcdb26a6f40f9e6fcb6dd42bd7f..d5b4be7e1284509a4494b0e804e5396c7cfcecc2 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -1997,6 +1997,30 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return HandleReducePrecision<ElementwiseT>(reduce_precision);
   }
 
+  template <typename NativeT,
+            typename std::enable_if<
+                std::is_same<NativeT, float>::value ||
+                std::is_same<NativeT, int32>::value ||
+                std::is_same<NativeT, uint32>::value>::type* = nullptr>
+  Status HandleIota(HloInstruction* iota) {
+    auto result = MakeUnique<Literal>(iota->shape());
+    auto data = result->data<ReturnT>();
+    std::iota(data.begin(), data.end(), 0);
+    parent_->evaluated_[iota] = std::move(result);
+    return Status::OK();
+  }
+  template <typename NativeT,
+            typename std::enable_if<
+                !(std::is_same<NativeT, float>::value ||
+                  std::is_same<NativeT, int32>::value ||
+                  std::is_same<NativeT, uint32>::value)>::type* = nullptr>
+  Status HandleIota(HloInstruction* iota) {
+    return InvalidArgument("Unsupported type for iota");
+  }
+  Status HandleIota(HloInstruction* iota) override {
+    return HandleIota<ReturnT>(iota);
+  }
+
  private:
   // Creates a vector of multipliers which can be used to create a linear index
   // into shape.
@@ -2054,10 +2078,6 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
                              start_indices_typed.end());
 
     // Clamp the start indices so the slice is in-bounds w.r.t the operand.
-
-    // TODO(b/74360564): This is implementation defined behavior, but is
-    // currently respected by all implementations. Change this if we ever decide
-    // to officially document different behavior.
     for (int64 i = 0; i < start.size(); ++i) {
       start[i] = std::min<int64>(
           std::max(int64{0}, start[i]),
@@ -2091,10 +2111,6 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
                              start_indices_typed.end());
     // Clamp the update start indices so the slice is in-bounds w.r.t the
     // operand.
-
-    // TODO(b/74360564): This is implementation defined behavior, but is
-    // currently respected by all implementations. Change this if we ever decide
-    // to oficially document different behavior.
     for (int64 i = 0; i < rank; ++i) {
       start[i] = std::min<int64>(
           std::max<int64>(0, start[i]),
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 57cf34d7dee94f0303cd6b5591ad6507828c70f3..fd5085bed234068a1bdf18977b38d92badc02a49 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -948,6 +948,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kGe:
     case HloOpcode::kGt:
     case HloOpcode::kImag:
+    case HloOpcode::kIota:
     case HloOpcode::kIsFinite:
     case HloOpcode::kLe:
     case HloOpcode::kLog:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index ae30d2ad8dab2ec695fed8e9a57549e290fc373e..8b9bdd2f46fe8a63b419b45ef2c2a2e025c60c8f 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -463,6 +463,11 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   return MakeUnique<HloConstantInstruction>(std::move(literal));
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateIota(
+    const Shape& shape) {
+  return WrapUnique(new HloInstruction(HloOpcode::kIota, shape));
+}
+
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateGetTupleElement(const Shape& shape,
                                       HloInstruction* operand, int64 index) {
@@ -1119,6 +1124,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kDynamicSlice:
     case HloOpcode::kSort:
     case HloOpcode::kGather:
+    case HloOpcode::kIota:
       clone = CloneWithNewOperandsImpl(shape, new_operands, context);
       break;
     // Unary ops.
@@ -1516,8 +1522,7 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kTupleSelect:
       return true;
 
-    // These opcodes have complex or special behavior so just return false.
-    case HloOpcode::kWhile:
+    // This opcode has complex or special behavior so just return false.
     case HloOpcode::kAfterAll:
       return false;
 
@@ -1533,6 +1538,14 @@ bool HloInstruction::IdenticalSlowPath(
       return eq_computations(true_computation(), other.true_computation()) &&
              eq_computations(false_computation(), other.false_computation());
 
+    case HloOpcode::kWhile: {
+      if (eq_computations(while_body(), other.while_body()) &&
+          eq_computations(while_condition(), other.while_condition())) {
+        return true;
+      }
+      return false;
+    }
+
     case HloOpcode::kDomain:
       return operand_side_metadata().Matches(other.operand_side_metadata()) &&
              user_side_metadata().Matches(other.user_side_metadata());
@@ -1556,6 +1569,7 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kMap:
     case HloOpcode::kSlice:
     case HloOpcode::kConstant:
+    case HloOpcode::kIota:
     case HloOpcode::kTrace:
     case HloOpcode::kFusion:
     case HloOpcode::kRng:
@@ -1576,6 +1590,7 @@ bool HloInstruction::IdenticalSlowPath(
       LOG(FATAL) << "Base class impl called for opcode with subclass: "
                  << opcode();
   }
+  return false;
 }
 
 void HloInstruction::RemoveUser(HloInstruction* user) {
@@ -2300,6 +2315,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleDomain(this);
     case HloOpcode::kAfterAll:
       return visitor->HandleAfterAll(this);
+    case HloOpcode::kIota:
+      return visitor->HandleIota(this);
 
     // These opcodes are not handled here.
     case HloOpcode::kTrace:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index c6faa69a7866f7851545e96beb1bc232e7267460..30bff286c20033ec193fb29d8c4c935ce6475a27 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -346,6 +346,9 @@ class HloInstruction {
   static std::unique_ptr<HloInstruction> CreateConstant(
       std::unique_ptr<Literal> literal);
 
+  // Creates an Iota instruction.
+  static std::unique_ptr<HloInstruction> CreateIota(const Shape& shape);
+
   // Creates a get tuple element instruction.
   static std::unique_ptr<HloInstruction> CreateGetTupleElement(
       const Shape& shape, HloInstruction* operand, int64 index);
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index 39e12c48157992410a5d3b733720d677a1191611..59e9a5a94aa4fc6270bde76c19dbd0d4506a563c 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -87,6 +87,7 @@ namespace xla {
   V(kHostCompute, "host-compute")                            \
   V(kImag, "imag")                                           \
   V(kInfeed, "infeed")                                       \
+  V(kIota, "iota")                                           \
   V(kIsFinite, "is-finite")                                  \
   V(kLe, "less-than-or-equal-to", kHloOpcodeIsComparison)    \
   V(kLog, "log")                                             \
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index 496eca07399847a1280c90bafd9d24b8db74d70c..d71d3c81702fb3d2adae82b1055464e4983eb891 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -492,6 +492,14 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
           HloInstruction::CreateConstant(std::move(literal)));
       break;
     }
+    case HloOpcode::kIota: {
+      if (!ParseOperands(&operands, /*expected_size=*/0) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateIota(shape));
+      break;
+    }
     // Unary ops.
     case HloOpcode::kAbs:
     case HloOpcode::kRoundNearestAfz:
@@ -1124,13 +1132,24 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     }
     case HloOpcode::kCustomCall: {
       optional<string> custom_call_target;
+      optional<Window> window;
+      optional<ConvolutionDimensionNumbers> dnums;
       attrs["custom_call_target"] = {/*required=*/true, AttrTy::kString,
                                      &custom_call_target};
+      attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window};
+      attrs["dim_labels"] = {/*required=*/false,
+                             AttrTy::kConvolutionDimensionNumbers, &dnums};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(HloInstruction::CreateCustomCall(
           shape, operands, *custom_call_target));
+      if (window.has_value()) {
+        instruction->set_window(*window);
+      }
+      if (dnums.has_value()) {
+        instruction->set_convolution_dimension_numbers(*dnums);
+      }
       break;
     }
     case HloOpcode::kHostCompute: {
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index 6ba34cf22a98d28dfdeb3e3de52aadd6be3512c7..1c08c51220e88cdd04b26fe3bcd84d28c4436e85 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -1004,6 +1004,28 @@ ENTRY CrossReplicaSumWithSubgroups {
   ROOT cross-replica-sum = f32[128,32]{0,1} cross-replica-sum(input), replica_group_ids={0,0,1,1}, barrier="abc", to_apply=add
 }
 
+)"
+},
+// Iota
+{
+"Iota",
+R"(HloModule iota
+
+ENTRY Iota {
+  ROOT iota = f32[100]{0} iota()
+}
+
+)"
+},
+// custom-call with window and dim_labels
+{
+"CustomCallWithWindowAndDimLabels",
+R"(HloModule CustomCallWithWindowAndDimLabels
+
+ENTRY Computation {
+  ROOT r = f32[100]{0} custom-call(), window={size=2x2}, dim_labels=b01f_01io->b01f, custom_call_target="target"
+}
+
 )"
 }
   });
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 59a8800a7d6e9417c0e561db45341c912ad20464..cf0be30c7ad5cbeb7fd3d71c7c649b6b448360b8 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -1203,7 +1203,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
 StatusOr<bool> HloRematerialization::Run(
     HloModule* module, SequentialHloOrdering::HloModuleSequence* sequence,
     int64 memory_limit_bytes, RematerializationSizes* sizes,
-    bool run_copy_elision) {
+    CopyInsertion* copy_insertion) {
   // The sequence is constructed entirely by this method.
   TF_RET_CHECK(sequence->empty());
 
@@ -1238,13 +1238,14 @@ StatusOr<bool> HloRematerialization::Run(
                                        return size_function_(buffer.shape());
                                      },
                                      scheduler_algorithm_));
-  if (run_copy_elision) {
+  if (copy_insertion) {
     // We run a separate pass of copy elision here because the sequential
     // ordering from the HLO schedule allows for more copies to be eliminated.
     // TODO(b/80249101): Instead of a separate copy elision pass, use the
     // ordering from the HLO schedule directly for copy insertion.
     SequentialHloOrdering ordering(module, *sequence);
-    TF_RETURN_IF_ERROR(RemoveUnnecessaryCopies(ordering, module));
+    TF_RETURN_IF_ERROR(
+        copy_insertion->RemoveUnnecessaryCopies(ordering, module));
   }
 
   // Compute peak memory usage of all computations in the module called in a
@@ -1349,10 +1350,10 @@ StatusOr<bool> HloRematerialization::Run(
     int64 memory_limit_bytes, HloModule* hlo_module,
     MemorySchedulerAlgorithm scheduler_algorithm,
     SequentialHloOrdering::HloModuleSequence* sequence,
-    RematerializationSizes* sizes, bool run_copy_elision) {
+    RematerializationSizes* sizes, CopyInsertion* copy_insertion) {
   HloRematerialization remat(scheduler_algorithm, size_function);
   return remat.Run(hlo_module, sequence, memory_limit_bytes, sizes,
-                   run_copy_elision);
+                   copy_insertion);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h
index 59b4cf5dcc761f70767ce4d7ff0959448f29939a..2ec004350ad88ff31ece90ec419d90a55b965166 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.h
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h
@@ -17,6 +17,7 @@
 
 #include "tensorflow/compiler/xla/service/buffer_liveness.h"
 #include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/copy_insertion.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
@@ -57,8 +58,9 @@ class HloRematerialization {
   //   sizes: Optional outparam that indicates the peak memory usage of the HLO
   //     module before/after rematerialization.
   //
-  //   run_copy_elision: Enable copy elision. This pass is used to eliminate
-  //     copies that were inserted before HLO scheduling.
+  //   copy_insertion: If non-null, run copy elision after scheduling. This
+  //     pass is used to eliminate copies that were inserted by copy insertion
+  //     before HLO scheduling.
   //
   // TODO(b/80249101): Remove the 'run_copy_elision' parameter when copy
   // insertion is integrated with HLO scheduling.
@@ -74,7 +76,7 @@ class HloRematerialization {
       const ShapeSizeFunction& size_function, int64 memory_limit_bytes,
       HloModule* hlo_module, MemorySchedulerAlgorithm scheduler_algorithm,
       SequentialHloOrdering::HloModuleSequence* sequence,
-      RematerializationSizes* sizes, bool run_copy_elision = true);
+      RematerializationSizes* sizes, CopyInsertion* copy_insertion = nullptr);
 
  protected:
   HloRematerialization(MemorySchedulerAlgorithm scheduler_algorithm,
@@ -90,7 +92,7 @@ class HloRematerialization {
   StatusOr<bool> Run(HloModule* module,
                      SequentialHloOrdering::HloModuleSequence* sequence,
                      int64 memory_limit, RematerializationSizes* sizes,
-                     bool run_copy_elision);
+                     CopyInsertion* copy_insertion);
 
   // Rematerializes instructions within the given computation. 'order' is the
   // order in which the computation's instructions will be emitted in the
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index cd131147e619003d7ff4888e0f9e4da586bc2e76..ac8c97d380953764b66135ad1c5fcee0d481c004 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -147,7 +147,7 @@ class HloRematerializationTest : public HloTestBase {
     TF_EXPECT_OK(verifier().Run(module).status());
     return HloRematerialization::RematerializeAndSchedule(
         ByteSizeOf, memory_limit_bytes, module, DefaultMemoryScheduler,
-        sequence, /*sizes=*/nullptr, /*run_copy_elision=*/false);
+        sequence, /*sizes=*/nullptr);
   }
 
   // Various shapes used in the canned computations.
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 4f0569f4059481aa19da8c7854fedf0e43182e36..b2725e2918ce76248d9f2cdbb2a6e5a63226bf9a 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -180,8 +180,12 @@ StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
                       CreateExecutable(std::move(module), run_hlo_passes));
-  return executable->ExecuteOnStreamWrapper(&service_run_options,
-                                            /*profile=*/profile, arguments);
+  TF_ASSIGN_OR_RETURN(
+      ScopedShapedBuffer retval,
+      executable->ExecuteOnStreamWrapper(&service_run_options,
+                                         /*profile=*/profile, arguments));
+  TF_RETURN_IF_ERROR(stream.BlockHostUntilDone());
+  return std::move(retval);
 }
 
 StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
@@ -309,6 +313,7 @@ StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
 
   std::vector<std::unique_ptr<Literal>> exec_results;
   for (int64 i = 0; i < options.num_replicas; ++i) {
+    TF_RETURN_IF_ERROR(streams[i]->BlockHostUntilDone());
     TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
                         backend().transfer_manager()->TransferLiteralFromDevice(
                             streams[i].get(), results[i]));
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 6a32093b6e76210893c9360acbd02e10f1777196..25fa319faf13d8bef69381c869f08f4948fc3519 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -119,7 +119,7 @@ Status CheckIsTokenOperand(const HloInstruction* instruction,
   const HloInstruction* token = instruction->operand(operand_no);
   if (!ShapeUtil::Equal(token->shape(), ShapeUtil::MakeTokenShape())) {
     return InternalError(
-        "Expected operand %lld to be token-shaped, actual shape is"
+        "Expected operand %lld to be token-shaped, actual shape is "
         "%s:\n%s",
         operand_no, ShapeUtil::HumanString(token->shape()).c_str(),
         instruction->ToString().c_str());
@@ -210,6 +210,12 @@ Status ShapeVerifier::HandleConstant(HloInstruction* constant) {
   return CheckShape(constant, constant->literal().shape());
 }
 
+Status ShapeVerifier::HandleIota(HloInstruction* iota) {
+  return ShapeUtil::Rank(iota->shape()) == 1
+             ? Status::OK()
+             : InternalError("Iota only supports arrays of rank 1.");
+}
+
 Status ShapeVerifier::HandleGetTupleElement(HloInstruction* get_tuple_element) {
   return CheckShape(get_tuple_element,
                     ShapeInference::InferGetTupleElementShape(
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 810c66cf021bcfdecbea346dd2fdd22607066499..79f7aa9f4ce66cc9b53d016f2e126033492c81e9 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -37,6 +37,7 @@ class ShapeVerifier : public DfsHloVisitor {
   Status HandleSelect(HloInstruction* select) override;
   Status HandleTupleSelect(HloInstruction* tuple_select) override;
   Status HandleConcatenate(HloInstruction* concatenate) override;
+  Status HandleIota(HloInstruction* iota) override;
   Status HandleConvert(HloInstruction* convert) override;
   Status HandleBitcastConvert(HloInstruction* convert) override;
   Status HandleCopy(HloInstruction* copy) override;
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index da91262130933b6d47fd95fb30bf89574b9469d6..af07370135ca2b2e53fcbcb53696e0aa12bf7a6f 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -73,6 +73,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kGt:
     case HloOpcode::kImag:
     case HloOpcode::kInfeed:
+    case HloOpcode::kIota:
     case HloOpcode::kIsFinite:
     case HloOpcode::kLe:
     case HloOpcode::kLt:
diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD
index c14a5bfb53f21cb2ed9f48009011ebf788e2dc25..cdd3daf73b8ac1a4d1ec3c81224c2c0bfe8e5811 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/BUILD
+++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD
@@ -180,6 +180,23 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "sort_util",
+    srcs = ["sort_util.cc"],
+    hdrs = ["sort_util.h"],
+    deps = [
+        ":ir_array",
+        ":llvm_loop",
+        ":llvm_util",
+        ":loop_emitter",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/service/gpu:parallel_loop_emitter",
+        "//tensorflow/compiler/xla/service/gpu:partition_assignment",
+        "//tensorflow/core:lib",
+        "@llvm//:core",
+    ],
+)
+
 cc_library(
     name = "tuple_ops",
     srcs = ["tuple_ops.cc"],
@@ -206,3 +223,22 @@ cc_library(
         "@llvm//:core",
     ],
 )
+
+cc_library(
+    name = "buffer_assignment_util",
+    srcs = ["buffer_assignment_util.cc"],
+    hdrs = ["buffer_assignment_util.h"],
+    deps = [
+        "//tensorflow/compiler/xla/service:buffer_assignment",
+    ],
+)
+
+cc_library(
+    name = "math_ops",
+    srcs = ["math_ops.cc"],
+    hdrs = ["math_ops.h"],
+    deps = [
+        ":llvm_util",
+        "@llvm//:core",
+    ],
+)
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
index 93a8c130e1af7ca90b3dc14661deb978ff97bece..e5370eca56f2e3a891523ba2b72961d66ec809aa 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis.cc
@@ -28,8 +28,7 @@ namespace llvm_ir {
 // Sentry allocation used to represent parameters of the entry computation in
 // alias_scope_metadata_ and noalias_metadata_.
 static const BufferAllocation* kParameterAllocation = new BufferAllocation(
-    /*index=*/-1, /*size=*/0, /*is_thread_local=*/false, /*is_reusable=*/false,
-    LogicalBuffer::Color(0));
+    /*index=*/-1, /*size=*/0, LogicalBuffer::Color(0));
 
 void AliasAnalysis::AddAliasingInformationToIrArray(const HloInstruction& hlo,
                                                     llvm_ir::IrArray* array,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc
index 2552ff4a6a06d18f34b4ba224b66d6d97ddd74d3..941d940684651792467a84e816a91533ce11dd63 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc
@@ -58,7 +58,7 @@ ENTRY while3 {
   CompileAndVerifyIr(hlo_string, R"(
 ; CHECK-LABEL: @body(i8* align 4 dereferenceable(4) %retval
 ; CHECK: %[[add_result:.*]] = fadd fast float %[[fadd_lhs:.*]], %[[fadd_rhs:.*]]
-; CHECK: store float %[[add_result]], float* %[[store_dest:.*]], !alias.scope ![[alias_scope_md_for_store:.*]]
+; CHECK: store float %[[add_result]], float* %[[store_dest:.*]], !alias.scope ![[alias_scope_md_for_store:[0-9]+]]
 ;
 ; CHECK-LABEL: @condition(i8* align 1 dereferenceable(1) %fusion, i8* noalias %run_options, i8** noalias %params
 ; CHECK: %[[cond_state_buf_ptr:.*]] = getelementptr inbounds i8*, i8** %params, i64 0
diff --git a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4eb5d9fb4750927ca189e02f312b2d6be7fdd418
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.cc
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
+
+namespace xla {
+namespace llvm_ir {
+static const HloInstruction& InstrForConstantBufferAllocation(
+    const BufferAllocation& allocation) {
+  CHECK(allocation.is_constant());
+  HloInstruction* const_instr = nullptr;
+  for (const auto& buffer_offset_pair : allocation.assigned_buffers()) {
+    const LogicalBuffer* buffer = buffer_offset_pair.first;
+    // BufferAssignment may have assigned non-constant instructions to this
+    // allocation too so we can't CHECK this condition.  E.g. for
+    //
+    //   while(init = constant, body = identity, cond = ...)
+    //
+    // the LogicalBuffer for the kWhile instruction will have the same
+    // BufferAllocation as the LogicalBuffer for the (init) constant.
+    if (buffer->instruction()->opcode() == HloOpcode::kConstant) {
+      CHECK_EQ(const_instr, nullptr)
+          << const_instr->ToString() << " " << buffer->ToString();
+      const_instr = buffer->instruction();
+    }
+  }
+  CHECK_NE(const_instr, nullptr);
+  return *const_instr;
+}
+
+string ConstantBufferAllocationToGlobalName(
+    const BufferAllocation& allocation) {
+  string instr_name = InstrForConstantBufferAllocation(allocation).name();
+  for (char& c : instr_name) {
+    if (c == '.') {
+      c = '_';
+    }
+  }
+  return tensorflow::strings::StrCat("buffer_for_", instr_name);
+}
+
+const Literal& LiteralForConstantAllocation(
+    const BufferAllocation& allocation) {
+  return InstrForConstantBufferAllocation(allocation).literal();
+}
+}  // namespace llvm_ir
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..bfb6eecb87f6a1b756b3a8da3377f608dd7f0be7
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_BUFFER_ASSIGNMENT_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_BUFFER_ASSIGNMENT_UTIL_H_
+
+#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+
+namespace xla {
+namespace llvm_ir {
+// In XLA:GPU we map constant buffer allocations to globals in the generated
+// LLVM IR.  This function gives us the name of the global variable a constant
+// buffer is mapped to.  Not used on XLA:CPU.
+string ConstantBufferAllocationToGlobalName(const BufferAllocation& allocation);
+
+// Returns the Literal corresponding to `allocation`, which must be a constant
+// allocation.
+const Literal& LiteralForConstantAllocation(const BufferAllocation& allocation);
+}  // namespace llvm_ir
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_BUFFER_ASSIGNMENT_UTIL_H_
diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
index 7048fcfdc90914deacf861c9c586b4bdc842ef41..27fbb11e2ede66a1268e7e949634b2c7d29cbc1c 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.cc
@@ -40,14 +40,14 @@ static Status EmitDynamicUpdateSliceInPlaceImpl(
     const Shape& update_shape, const ElementGenerator& start_indices_generator,
     bool is_signed, ElementGenerator update_array_generator,
     const IrArray& output_array, const gpu::LaunchDimensions* launch_dimensions,
-    tensorflow::StringPiece name, llvm::IRBuilder<>* ir_builder) {
+    tensorflow::StringPiece name, llvm::IRBuilder<>* b) {
   const Shape& output_shape = output_array.GetShape();
 
   // Read start indices from start_indices_generator.
   const int64 rank = ShapeUtil::Rank(output_shape);
-  IrArray::Index start_index(ir_builder->getInt64Ty(), rank);
+  IrArray::Index start_index(b->getInt64Ty(), rank);
   for (int64 i = 0; i < rank; ++i) {
-    IrArray::Index dim_index({ir_builder->getInt64(i)});
+    IrArray::Index dim_index({b->getInt64(i)});
     TF_ASSIGN_OR_RETURN(start_index[i], start_indices_generator(dim_index));
     llvm::Value* output_dim_size = llvm::ConstantInt::get(
         start_index[i]->getType(), output_shape.dimensions(i));
@@ -56,24 +56,19 @@ static Status EmitDynamicUpdateSliceInPlaceImpl(
 
     // Clamp the start index so that the update region fits in the operand.
     // start_index = clamp(start_index, 0, output_dim_size - update_dim_size)
-
-    // TODO(b/74360564): This is implementation defined behavior, but is
-    // currently respected by all implementations. Change this if we ever decide
-    // to officially document different behavior.
-    llvm::Value* max_bound =
-        ir_builder->CreateSub(output_dim_size, update_dim_size);
+    llvm::Value* max_bound = b->CreateSub(output_dim_size, update_dim_size);
     llvm::Value* zero = llvm::ConstantInt::get(start_index[i]->getType(), 0);
-    start_index[i] = ir_builder->CreateSelect(
-        ir_builder->CreateICmp(
-            is_signed ? llvm::ICmpInst::ICMP_SGE : llvm::ICmpInst::ICMP_UGE,
-            zero, start_index[i]),
-        zero, start_index[i]);
-
-    start_index[i] = ir_builder->CreateSelect(
-        ir_builder->CreateICmp(
-            is_signed ? llvm::ICmpInst::ICMP_SLE : llvm::ICmpInst::ICMP_ULE,
-            max_bound, start_index[i]),
-        max_bound, start_index[i]);
+    start_index[i] =
+        b->CreateSelect(b->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SGE
+                                                : llvm::ICmpInst::ICMP_UGE,
+                                      zero, start_index[i]),
+                        zero, start_index[i]);
+
+    start_index[i] =
+        b->CreateSelect(b->CreateICmp(is_signed ? llvm::ICmpInst::ICMP_SLE
+                                                : llvm::ICmpInst::ICMP_ULE,
+                                      max_bound, start_index[i]),
+                        max_bound, start_index[i]);
   }
 
   auto loop_body_emitter = [&](const IrArray::Index& update_index) -> Status {
@@ -84,31 +79,30 @@ static Status EmitDynamicUpdateSliceInPlaceImpl(
     //
     IrArray::Index output_index(start_index.GetType(), rank);
     for (int64 i = 0; i < rank; ++i) {
-      llvm::Value* start_index0 = ir_builder->CreateSExtOrBitCast(
-          start_index[i], update_index[i]->getType());
-      output_index[i] = ir_builder->CreateAdd(start_index0, update_index[i]);
+      llvm::Value* start_index0 =
+          b->CreateSExtOrBitCast(start_index[i], update_index[i]->getType());
+      output_index[i] = b->CreateAdd(start_index0, update_index[i]);
     }
 
     // Do output[output_index] = update[update_index].
     TF_ASSIGN_OR_RETURN(llvm::Value * update_data,
                         update_array_generator(update_index));
-    output_array.EmitWriteArrayElement(output_index, update_data, ir_builder);
+    output_array.EmitWriteArrayElement(output_index, update_data, b);
     return Status::OK();
   };
 
   if (launch_dimensions != nullptr) {
     return gpu::ParallelLoopEmitter(loop_body_emitter, update_shape,
-                                    *launch_dimensions, ir_builder)
+                                    *launch_dimensions, b)
         .EmitLoop(name);
   }
-  return LoopEmitter(loop_body_emitter, update_shape, ir_builder)
-      .EmitLoop(name);
+  return LoopEmitter(loop_body_emitter, update_shape, b).EmitLoop(name);
 }
 
 Status EmitDynamicUpdateSliceInPlace(
     tensorflow::gtl::ArraySlice<IrArray> operand_arrays,
     const IrArray& output_array, tensorflow::StringPiece name,
-    llvm::IRBuilder<>* ir_builder) {
+    llvm::IRBuilder<>* b) {
   VLOG(2) << "EmitDynamicUpdateSliceInPlace for " << name;
 
   // No need to use operand_arrays[0], the input array of the
@@ -119,16 +113,16 @@ Status EmitDynamicUpdateSliceInPlace(
   Shape update_shape = update_array.GetShape();
 
   ElementGenerator start_indices_generator = [&](const IrArray::Index& index) {
-    return start_indices_array.EmitReadArrayElement(index, ir_builder);
+    return start_indices_array.EmitReadArrayElement(index, b);
   };
   ElementGenerator update_array_generator = [&](const IrArray::Index& index) {
-    return update_array.EmitReadArrayElement(index, ir_builder);
+    return update_array.EmitReadArrayElement(index, b);
   };
 
   bool is_signed = ShapeUtil::ElementIsSigned(start_indices_array.GetShape());
   return EmitDynamicUpdateSliceInPlaceImpl(
       update_shape, start_indices_generator, is_signed, update_array_generator,
-      output_array, /*launch_dimensions=*/nullptr, name, ir_builder);
+      output_array, /*launch_dimensions=*/nullptr, name, b);
 }
 
 // Shared implementation for EmitFusedDynamicUpdateSliceInPlace and
@@ -139,8 +133,7 @@ static Status EmitFusedDynamicUpdateSliceInPlaceImpl(
     HloInstruction* fusion,
     tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
     const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
-    const gpu::LaunchDimensions* launch_dimensions,
-    llvm::IRBuilder<>* ir_builder) {
+    const gpu::LaunchDimensions* launch_dimensions, llvm::IRBuilder<>* b) {
   CHECK_EQ(fusion->opcode(), HloOpcode::kFusion);
   VLOG(2) << "EmitFusedDynamicUpdateSliceInPlace for "
           << fusion->ToShortString();
@@ -177,28 +170,27 @@ static Status EmitFusedDynamicUpdateSliceInPlaceImpl(
   bool is_signed = ShapeUtil::ElementIsSigned(start_indices->shape());
   return EmitDynamicUpdateSliceInPlaceImpl(
       update_shape, start_indices_generator, is_signed, update_array_generator,
-      fusion_output_array, launch_dimensions, IrName(fusion), ir_builder);
+      fusion_output_array, launch_dimensions, IrName(fusion), b);
 }
 
 Status EmitFusedDynamicUpdateSliceInPlace(
     HloInstruction* fusion,
     tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
     const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
-    llvm::IRBuilder<>* ir_builder) {
+    llvm::IRBuilder<>* b) {
   return EmitFusedDynamicUpdateSliceInPlaceImpl(
       fusion, fusion_operand_arrays, fusion_output_array, elemental_emitter,
-      /*launch_dimensions=*/nullptr, ir_builder);
+      /*launch_dimensions=*/nullptr, b);
 }
 
 Status EmitParallelFusedDynamicUpdateSliceInPlace(
     HloInstruction* fusion,
     tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
     const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
-    const gpu::LaunchDimensions& launch_dimensions,
-    llvm::IRBuilder<>* ir_builder) {
+    const gpu::LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b) {
   return EmitFusedDynamicUpdateSliceInPlaceImpl(
       fusion, fusion_operand_arrays, fusion_output_array, elemental_emitter,
-      &launch_dimensions, ir_builder);
+      &launch_dimensions, b);
 }
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h
index 7f73fb6b29f37f9b50e1fe5a15036776349c4061..3502577d236a099e0b721b98217b758696966821 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h
@@ -66,7 +66,7 @@ inline bool CanEmitFusedDynamicUpdateSliceInPlace(
 Status EmitDynamicUpdateSliceInPlace(
     tensorflow::gtl::ArraySlice<IrArray> operand_arrays,
     const IrArray& output_array, tensorflow::StringPiece name,
-    llvm::IRBuilder<>* ir_builder);
+    llvm::IRBuilder<>* b);
 
 // Given a loop-fusion node whose root is a dynamic-update-slice op whose
 // array-to-be-updated and output share the same buffer slice, emits
@@ -76,7 +76,7 @@ Status EmitFusedDynamicUpdateSliceInPlace(
     HloInstruction* fusion,
     tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
     const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
-    llvm::IRBuilder<>* ir_builder);
+    llvm::IRBuilder<>* b);
 
 // Same as EmitFusedDynamicUpdateSliceInPlace, except emits a parallel loop with
 // the given launch dimensions.
@@ -84,8 +84,7 @@ Status EmitParallelFusedDynamicUpdateSliceInPlace(
     HloInstruction* fusion,
     tensorflow::gtl::ArraySlice<IrArray> fusion_operand_arrays,
     const IrArray& fusion_output_array, ElementalIrEmitter* elemental_emitter,
-    const gpu::LaunchDimensions& launch_dimensions,
-    llvm::IRBuilder<>* ir_builder);
+    const gpu::LaunchDimensions& launch_dimensions, llvm::IRBuilder<>* b);
 
 }  // namespace llvm_ir
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
index b12ce97e286224fcc39de74095979ea9ae80d674..72ede377e1a505d5e4916915e18827e1a0f3fdf9 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
@@ -52,7 +52,7 @@ Status FusedIrEmitter::DefaultAction(HloInstruction* hlo) {
       // that would be regenerated without caching. But this might increase the
       // JIT compilation time.
       if (generated_value_bb == nullptr ||
-          generated_value_bb == ir_builder_->GetInsertBlock()) {
+          generated_value_bb == b_->GetInsertBlock()) {
         VLOG(3) << "The cached generated value is reused.";
         return generated_value;
       }
@@ -60,8 +60,7 @@ Status FusedIrEmitter::DefaultAction(HloInstruction* hlo) {
                  "a different BB ("
               << llvm_ir::AsString(generated_value_bb->getName())
               << ") from the current insertion block ("
-              << llvm_ir::AsString(ir_builder_->GetInsertBlock()->getName())
-              << ").";
+              << llvm_ir::AsString(b_->GetInsertBlock()->getName()) << ").";
     }
 
     TF_ASSIGN_OR_RETURN(
@@ -77,14 +76,14 @@ Status FusedIrEmitter::HandleConstant(HloInstruction* constant) {
   llvm::Constant* initializer =
       llvm_ir::ConvertLiteralToIrConstant(literal, module_);
   llvm::GlobalVariable* global = new llvm::GlobalVariable(
-      *ir_builder_->GetInsertBlock()->getModule(), initializer->getType(),
+      *b_->GetInsertBlock()->getModule(), initializer->getType(),
       /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, initializer,
       /*Name=*/"");
   llvm::Constant* shape_constant = llvm::ConstantExpr::getBitCast(
       global, llvm_ir::ShapeToIrType(literal.shape(), module_)->getPointerTo());
   generators_[constant] = [=](const IrArray::Index& index) {
     return IrArray(shape_constant, constant->shape())
-        .EmitReadArrayElement(index, ir_builder_);
+        .EmitReadArrayElement(index, b_);
   };
 
   return Status::OK();
@@ -104,7 +103,7 @@ Status FusedIrEmitter::HandleGetTupleElement(
   // Emit code to lookup tuple element pointer, and store it in 'gte_values_'.
   llvm::Value* tuple_element_ptr = llvm_ir::EmitGetTupleElement(
       get_tuple_element->shape(), get_tuple_element->tuple_index(),
-      /*alignment=*/1, it->second, ir_builder_, module_);
+      /*alignment=*/1, it->second, b_, module_);
   gte_values_.insert(std::make_pair(get_tuple_element, tuple_element_ptr));
   // Emit code to read base tuple element array (if non-tuple shaped).
   if (!ShapeUtil::IsTuple(get_tuple_element->shape())) {
@@ -112,7 +111,7 @@ Status FusedIrEmitter::HandleGetTupleElement(
         [=](const IrArray::Index& index) -> StatusOr<llvm::Value*> {
       // TODO(b/34080002) Add aliasing information to tuple element IrArray.
       return IrArray(tuple_element_ptr, get_tuple_element->shape())
-          .EmitReadArrayElement(index, ir_builder_);
+          .EmitReadArrayElement(index, b_);
     };
   }
   return Status::OK();
@@ -129,16 +128,15 @@ Status FusedIrEmitter::HandleParameter(HloInstruction* parameter) {
         // want the AA info to be present before address spaces are inferred
         // (which is pretty late in the pipeline), so even if we had
         // address-space-based AA in LLVM, it wouldn't help us much here.
-        return ir_builder_->CreateLoad(
-            ir_builder_->CreateGEP(
-                param_tile_buffer,
-                {index.GetConstantWithIndexType(0), tiled_parameter_info_->x(),
-                 tiled_parameter_info_->y()}),
+        return b_->CreateLoad(
+            b_->CreateGEP(param_tile_buffer, {index.GetConstantWithIndexType(0),
+                                              tiled_parameter_info_->x(),
+                                              tiled_parameter_info_->y()}),
             "tiled_buffer");
       }
     }
     return parameter_arrays_[parameter->parameter_number()]
-        .EmitReadArrayElement(index, ir_builder_);
+        .EmitReadArrayElement(index, b_);
   };
   // Store ir value for fusion operand associated with fusion parameter to be
   // accessed by subsequent fused GetTupleElement instructions.
@@ -157,11 +155,11 @@ Status FusedIrEmitter::HandleTuple(HloInstruction* tuple) {
   }
   generators_[tuple] =
       [=](const IrArray::Index& index) -> StatusOr<llvm::Value*> {
-    llvm::Value* ret = llvm::UndefValue::get(llvm::StructType::get(
-        ir_builder_->getContext(), operand_elemental_ir_types));
+    llvm::Value* ret = llvm::UndefValue::get(
+        llvm::StructType::get(b_->getContext(), operand_elemental_ir_types));
     for (size_t i = 0; i < ShapeUtil::TupleElementCount(tuple->shape()); ++i) {
       TF_ASSIGN_OR_RETURN(llvm::Value * val_i, generators_[operands[i]](index));
-      ret = ir_builder_->CreateInsertValue(ret, val_i, i);
+      ret = b_->CreateInsertValue(ret, val_i, i);
     }
     return ret;
   };
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
index a6ceec7b230e9c4d24ba18aa1557a4624a37a0b4..30471480c4fb3ce3bf3226a28e9d2ffa79ae5f29 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h
@@ -59,7 +59,7 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault {
       : parameter_arrays_(parameter_arrays),
         tiled_parameter_info_(nullptr),
         elemental_emitter_(elemental_emitter),
-        ir_builder_(elemental_emitter->ir_builder()),
+        b_(elemental_emitter->b()),
         module_(elemental_emitter->module()) {}
 
   Status DefaultAction(HloInstruction* hlo) override;
@@ -103,7 +103,7 @@ class FusedIrEmitter : public DfsHloVisitorWithDefault {
   const HloInstruction* fused_root_ = nullptr;
 
   // Borrowed
-  llvm::IRBuilder<>* ir_builder_;
+  llvm::IRBuilder<>* b_;
   llvm::Module* module_;
 
   // Map from instruction pointers to functions to generate elements of their
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
index dcf9838d8043fb716bed2990106f058e3bec3345..2b6caee6aa72f426cf85c8c56c3ef500ff8c5d3d 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
@@ -31,7 +31,7 @@ namespace llvm_ir {
 
 void IrArray::Index::Delinearize(std::vector<llvm::Value*>* multidim,
                                  llvm::Value* linear, const Shape& shape,
-                                 llvm::IRBuilder<>* ir_builder) const {
+                                 llvm::IRBuilder<>* b) const {
   int64 divisor = 1;
   const Layout& layout = shape.layout();
   for (int64 i = 0; i < layout.minor_to_major_size(); ++i) {
@@ -48,10 +48,9 @@ void IrArray::Index::Delinearize(std::vector<llvm::Value*>* multidim,
     // useful because cuda-memcheck can't help us much in XLA: Most of our
     // memory lives in one big allocation, so cuda-memcheck can't detect
     // out-of-bounds accesses.
-    auto* quot =
-        ir_builder->CreateUDiv(linear, GetConstantWithIndexType(divisor));
+    auto* quot = b->CreateUDiv(linear, GetConstantWithIndexType(divisor));
     if (i < layout.minor_to_major_size() - 1) {
-      (*multidim)[dimension] = ir_builder->CreateURem(
+      (*multidim)[dimension] = b->CreateURem(
           quot, GetConstantWithIndexType(size_of_current_dimension));
     } else {
       (*multidim)[dimension] = quot;
@@ -61,7 +60,7 @@ void IrArray::Index::Delinearize(std::vector<llvm::Value*>* multidim,
 }
 
 IrArray::Index::Index(llvm::Value* linear, const Shape& shape,
-                      llvm::IRBuilder<>* ir_builder)
+                      llvm::IRBuilder<>* b)
     : multidim_(ShapeUtil::Rank(shape)),
       linear_(linear),
       layout_(shape.layout()),
@@ -71,7 +70,7 @@ IrArray::Index::Index(llvm::Value* linear, const Shape& shape,
   CHECK(LayoutUtil::HasLayout(shape))
       << "Shape " << ShapeUtil::HumanStringWithLayout(shape)
       << " should have a layout.";
-  Delinearize(&multidim_, linear, shape, ir_builder);
+  Delinearize(&multidim_, linear, shape, b);
 }
 
 IrArray::Index::Index(tensorflow::gtl::ArraySlice<llvm::Value*> multidim,
@@ -94,7 +93,7 @@ IrArray::Index::Index(tensorflow::gtl::ArraySlice<llvm::Value*> multidim,
 }
 
 IrArray::Index::Index(tensorflow::gtl::ArraySlice<llvm::Value*> multidim,
-                      const Shape& shape, llvm::IRBuilder<>* ir_builder)
+                      const Shape& shape, llvm::IRBuilder<>* b)
     : multidim_(multidim.begin(), multidim.end()),
       layout_(shape.layout()),
       dims_(shape.dimensions().begin(), shape.dimensions().end()) {
@@ -328,6 +327,7 @@ llvm::Value* IrArray::Index::Linearize(
     llvm::IRBuilder<>* builder) const {
   // Each dimension is multiplied by the product of the sizes of all
   // earlier dimensions and added to the accumulator logical_linear_index.
+  CHECK_EQ(size(), dimensions.size());
   llvm::Value* logical_linear_index = GetConstantWithIndexType(0);
   int64 multiplier = 1;
   for (ssize_t i = size() - 1; i >= 0; --i) {
@@ -343,7 +343,7 @@ llvm::Value* IrArray::Index::Linearize(
 }
 
 llvm::Value* IrArray::EmitArrayElementAddress(
-    const IrArray::Index& index, llvm::IRBuilder<>* ir_builder,
+    const IrArray::Index& index, llvm::IRBuilder<>* b,
     tensorflow::StringPiece name) const {
   if (ShapeUtil::IsScalar(*shape_)) {
     // Special handling of scalars: a scalar pretends to have the same value for
@@ -354,12 +354,11 @@ llvm::Value* IrArray::EmitArrayElementAddress(
   CHECK_EQ(index.size(), ShapeUtil::Rank(*shape_));
 
   if (index.LinearValidOnShape(*shape_)) {
-    llvm::Module* module =
-        ir_builder->GetInsertBlock()->getParent()->getParent();
-    return ir_builder->CreateInBoundsGEP(
-        ir_builder->CreateBitCast(
-            base_ptr_, PrimitiveTypeToIrType(shape_->element_type(), module)
-                           ->getPointerTo()),
+    llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
+    return b->CreateInBoundsGEP(
+        b->CreateBitCast(base_ptr_,
+                         PrimitiveTypeToIrType(shape_->element_type(), module)
+                             ->getPointerTo()),
         {index.linear()}, llvm_ir::AsStringRef(name));
   }
 
@@ -385,8 +384,8 @@ llvm::Value* IrArray::EmitArrayElementAddress(
     int64 dimension = LayoutUtil::Major(shape_->layout(), i);
     gep_indices.push_back(actual_index[dimension]);
   }
-  return ir_builder->CreateInBoundsGEP(base_ptr_, gep_indices,
-                                       llvm_ir::AsStringRef(name));
+  return b->CreateInBoundsGEP(base_ptr_, gep_indices,
+                              llvm_ir::AsStringRef(name));
 }
 
 void IrArray::AnnotateLoadStoreInstructionWithMetadata(
@@ -402,29 +401,27 @@ void IrArray::AnnotateLoadStoreInstructionWithMetadata(
 }
 
 llvm::Value* IrArray::EmitReadArrayElement(const Index& index,
-                                           llvm::IRBuilder<>* ir_builder,
+                                           llvm::IRBuilder<>* b,
                                            tensorflow::StringPiece name) const {
-  llvm::Value* element_address =
-      EmitArrayElementAddress(index, ir_builder, name);
-  llvm::LoadInst* load = ir_builder->CreateLoad(element_address);
+  llvm::Value* element_address = EmitArrayElementAddress(index, b, name);
+  llvm::LoadInst* load = b->CreateLoad(element_address);
   AnnotateLoadStoreInstructionWithMetadata(load);
   return load;
 }
 
 void IrArray::EmitWriteArrayElement(const Index& index, llvm::Value* value,
-                                    llvm::IRBuilder<>* ir_builder) const {
-  llvm::Value* element_address = EmitArrayElementAddress(index, ir_builder);
-  llvm::StoreInst* store = ir_builder->CreateStore(value, element_address);
+                                    llvm::IRBuilder<>* b) const {
+  llvm::Value* element_address = EmitArrayElementAddress(index, b);
+  llvm::StoreInst* store = b->CreateStore(value, element_address);
   AnnotateLoadStoreInstructionWithMetadata(store);
 }
 
 IrArray IrArray::CastToShape(const Shape& new_shape,
-                             llvm::IRBuilder<>* ir_builder) const {
-  llvm::Module* module = ir_builder->GetInsertBlock()->getParent()->getParent();
+                             llvm::IRBuilder<>* b) const {
+  llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
   llvm::Type* new_ir_type = llvm_ir::ShapeToIrType(new_shape, module);
   IrArray new_irarray(
-      ir_builder->CreatePointerCast(base_ptr_, new_ir_type->getPointerTo()),
-      new_shape);
+      b->CreatePointerCast(base_ptr_, new_ir_type->getPointerTo()), new_shape);
   new_irarray.metadata_ = metadata_;
   return new_irarray;
 }
@@ -432,9 +429,9 @@ IrArray IrArray::CastToShape(const Shape& new_shape,
 /* static */ IrArray::Index IrArray::BumpIndex(const Index& index,
                                                int64 which_dimension,
                                                int64 addend,
-                                               llvm::IRBuilder<>* ir_builder) {
+                                               llvm::IRBuilder<>* b) {
   Index new_index = index;
-  new_index[which_dimension] = ir_builder->CreateAdd(
+  new_index[which_dimension] = b->CreateAdd(
       index[which_dimension],
       llvm::ConstantInt::get(index[which_dimension]->getType(), addend), "",
       /*HasNUW=*/true,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
index 0777c499238edc6091fe637d2b6b3a1f4a347254..28ca793e3eeaed86664bfa6aa859a38f2c4dc6f3 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
@@ -87,20 +87,19 @@ class IrArray {
     }
 
     // Constructs an index from linear index "linear" and computes the
-    // multi-dimensional index from "linear" and "shape". "ir_builder" is the IR
+    // multi-dimensional index from "linear" and "shape". "b" is the IR
     // builder to emit the index of each dimension in the multi-dimensional
     // index.
     //
     // Precondition: "shape" has a layout.
-    Index(llvm::Value* linear, const Shape& shape,
-          llvm::IRBuilder<>* ir_builder);
+    Index(llvm::Value* linear, const Shape& shape, llvm::IRBuilder<>* b);
 
     // Constructs an index from the given multi-dimensional index and the shape
     // that it indexes into.
     //
     // Precondition: "shape" has a layout.
     Index(tensorflow::gtl::ArraySlice<llvm::Value*> multidim,
-          const Shape& shape, llvm::IRBuilder<>* ir_builder);
+          const Shape& shape, llvm::IRBuilder<>* b);
 
     // Constructs an index from both a multi-dimensional index and a linear
     // index. "shape" has the same meaning as that in the constructor that takes
@@ -191,7 +190,7 @@ class IrArray {
     }
 
     void Delinearize(std::vector<llvm::Value*>* multidim, llvm::Value* linear,
-                     const Shape& shape, llvm::IRBuilder<>* ir_builder) const;
+                     const Shape& shape, llvm::IRBuilder<>* b) const;
 
     std::vector<llvm::Value*> multidim_;
 
@@ -240,8 +239,7 @@ class IrArray {
   //
   // The optional name is useful for debugging when looking at
   // the emitted LLVM IR.
-  llvm::Value* EmitArrayElementAddress(const Index& index,
-                                       llvm::IRBuilder<>* ir_builder,
+  llvm::Value* EmitArrayElementAddress(const Index& index, llvm::IRBuilder<>* b,
                                        tensorflow::StringPiece name = "") const;
 
   // Attach metadata this IrArray instance knows about to "instruction".
@@ -255,18 +253,16 @@ class IrArray {
   //
   // The optional name is useful for debugging when looking at
   // the emitted LLVM IR.
-  llvm::Value* EmitReadArrayElement(const Index& index,
-                                    llvm::IRBuilder<>* ir_builder,
+  llvm::Value* EmitReadArrayElement(const Index& index, llvm::IRBuilder<>* b,
                                     tensorflow::StringPiece name = "") const;
 
   // Emit IR to write the given value to the array element at the given index.
   void EmitWriteArrayElement(const Index& index, llvm::Value* value,
-                             llvm::IRBuilder<>* ir_builder) const;
+                             llvm::IRBuilder<>* b) const;
 
   // Returns a new IrArray whose shape is "new_shape" and base pointer is a
   // bitcast of the base pointer of "this" IrArray.
-  IrArray CastToShape(const Shape& new_shape,
-                      llvm::IRBuilder<>* ir_builder) const;
+  IrArray CastToShape(const Shape& new_shape, llvm::IRBuilder<>* b) const;
 
   void AddAliasScopeMetadata(llvm::MDNode* alias_scope) {
     CHECK_NE(alias_scope, nullptr);
@@ -312,7 +308,7 @@ class IrArray {
   // Bumps the "which_dimension" value within the provided index by the provided
   // addend.
   static Index BumpIndex(const Index& index, int64 which_dimension,
-                         int64 addend, llvm::IRBuilder<>* ir_builder);
+                         int64 addend, llvm::IRBuilder<>* b);
 
  private:
   // Add the specified LLVM IR metadata to loads/stores associated with this
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
index 98d0ceb3e2065890d6a7eba8b61fa369720332f8..b79567369aa532c4963e3941f6cb9844cd1476dd 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
@@ -22,9 +22,9 @@ Status KernelSupportLibrary::For(
     tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
     llvm::Value* step,
     const std::function<Status(llvm::Value*, bool)>& for_body_generator) {
-  return If(ir_builder_->CreateICmpSLT(start, end), [&]() -> Status {
+  return If(b_->CreateICmpSLT(start, end), [&]() -> Status {
     TF_RETURN_IF_ERROR(for_body_generator(start, /*is_first_iteration=*/true));
-    return For(name, ir_builder_->CreateAdd(start, step), end, step,
+    return For(name, b_->CreateAdd(start, step), end, step,
                [&](llvm::Value* iv) { return for_body_generator(iv, false); });
   });
 }
@@ -37,20 +37,20 @@ Status KernelSupportLibrary::For(
   if (peel_first_iteration) {
     return For(name, start, end, step, true,
                [&](llvm::Value* indvar, bool is_first_iteration) -> Status {
-                 return for_body_generator(
-                     indvar, ir_builder_->getInt1(is_first_iteration));
+                 return for_body_generator(indvar,
+                                           b_->getInt1(is_first_iteration));
                });
   } else {
     std::unique_ptr<llvm_ir::ForLoop> loop = llvm_ir::ForLoop::EmitForLoop(
-        name, start, end, step, ir_builder_,
+        name, start, end, step, b_,
         /*unroll_mode=*/unroll_mode_,
         /*prevent_vectorization=*/prevent_vectorization_);
-    ir_builder_->SetInsertPoint(&loop->GetBodyBasicBlock()->back());
+    b_->SetInsertPoint(&loop->GetBodyBasicBlock()->back());
     TF_RETURN_IF_ERROR(
         for_body_generator(loop->GetIndVarValue(),
-                           /*is_first_iteration=*/ir_builder_->CreateICmpEQ(
+                           /*is_first_iteration=*/b_->CreateICmpEQ(
                                loop->GetIndVarValue(), start)));
-    llvm_ir::SetToLastInsertPoint(loop->GetExitBasicBlock(), ir_builder_);
+    llvm_ir::SetToLastInsertPoint(loop->GetExitBasicBlock(), b_);
     return Status::OK();
   }
 }
@@ -59,23 +59,22 @@ Status KernelSupportLibrary::If(
     tensorflow::StringPiece name, llvm::Value* condition,
     const std::function<Status()>& true_block_generator,
     const std::function<Status()>& false_block_generator) {
-  llvm_ir::LlvmIfData if_data =
-      llvm_ir::EmitIfThenElse(condition, name, ir_builder_);
-  ir_builder_->SetInsertPoint(&if_data.true_block->back());
+  llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse(condition, name, b_);
+  b_->SetInsertPoint(&if_data.true_block->back());
   TF_RETURN_IF_ERROR(true_block_generator());
-  ir_builder_->SetInsertPoint(&if_data.false_block->back());
+  b_->SetInsertPoint(&if_data.false_block->back());
   TF_RETURN_IF_ERROR(false_block_generator());
-  llvm_ir::SetToLastInsertPoint(if_data.after_block, ir_builder_);
+  llvm_ir::SetToLastInsertPoint(if_data.after_block, b_);
   return Status::OK();
 }
 
 void KernelSupportLibrary::EmitAndCallOutlinedKernel(
-    bool enable_fast_math, bool optimize_for_size,
-    llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
+    bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b,
+    tensorflow::StringPiece kernel_name,
     KernelSupportLibrary::ArgumentVector arguments,
     const std::function<void(KernelSupportLibrary::ArgumentVector)>&
         kernel_body_generator) {
-  llvm::Module* module = ir_builder->GetInsertBlock()->getModule();
+  llvm::Module* module = b->GetInsertBlock()->getModule();
   llvm::Function* function =
       module->getFunction(llvm_ir::AsStringRef(kernel_name));
 
@@ -98,22 +97,22 @@ void KernelSupportLibrary::EmitAndCallOutlinedKernel(
                    std::back_inserter(arg_types),
                    [](llvm::Value* arg) { return arg->getType(); });
 
-    auto* function_type = llvm::FunctionType::get(
-        ir_builder->getVoidTy(), arg_types, /*isVarArg=*/false);
+    auto* function_type =
+        llvm::FunctionType::get(b->getVoidTy(), arg_types, /*isVarArg=*/false);
 
     function = llvm_ir::CreateFunction(
         function_type, llvm::GlobalValue::InternalLinkage,
         /*enable_fast_math=*/enable_fast_math,
         /*optimize_for_size=*/optimize_for_size, kernel_name, module);
 
-    llvm::IRBuilder<>::InsertPointGuard guard(*ir_builder);
+    llvm::IRBuilder<>::InsertPointGuard guard(*b);
 
     auto* entry_bb =
-        llvm::BasicBlock::Create(ir_builder->getContext(), "entry", function);
-    auto* return_inst = llvm::ReturnInst::Create(ir_builder->getContext(),
+        llvm::BasicBlock::Create(b->getContext(), "entry", function);
+    auto* return_inst = llvm::ReturnInst::Create(b->getContext(),
                                                  /*retVal=*/nullptr, entry_bb);
     // Set the insert point to before return_inst.
-    ir_builder->SetInsertPoint(return_inst);
+    b->SetInsertPoint(return_inst);
 
     std::vector<llvm::Value*> arg_values;
     /*
@@ -133,7 +132,7 @@ void KernelSupportLibrary::EmitAndCallOutlinedKernel(
     VLOG(3) << "Re-using kernel for " << kernel_name;
   }
 
-  ir_builder->CreateCall(function, llvm_ir::AsArrayRef(sanitized_args));
+  b->CreateCall(function, llvm_ir::AsArrayRef(sanitized_args));
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
index 9d770cc4c309d40222108833176ac5dad46754fe..b00f903d56a83c5b76188007702470c44c55c213 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
@@ -30,14 +30,14 @@ namespace xla {
 // flow more readable.
 class KernelSupportLibrary {
  public:
-  // `ir_builder` is the llvm::IRBuilder instance used to generate LLVM IR.
+  // `b` is the llvm::IRBuilder instance used to generate LLVM IR.
   // `unroll_mode` specifies the desired LLVM unrolling behavior for every loop
   // generated by this instance of KernelSupportLibrary.
   explicit KernelSupportLibrary(
-      llvm::IRBuilder<>* ir_builder,
+      llvm::IRBuilder<>* b,
       llvm_ir::UnrollMode unroll_mode = llvm_ir::UnrollMode::kNoUnroll,
       bool prevent_vectorization = true)
-      : ir_builder_(ir_builder),
+      : b_(b),
         unroll_mode_(unroll_mode),
         prevent_vectorization_(prevent_vectorization) {}
 
@@ -71,18 +71,18 @@ class KernelSupportLibrary {
              const std::function<Status(llvm::Value* ind_var,
                                         bool is_first_iteration)>&
                  for_body_generator) {
-    return For(name, /*start=*/ir_builder_->getInt64(start),
-               /*end=*/ir_builder_->getInt64(end),
-               /*step=*/ir_builder_->getInt64(step), for_body_generator);
+    return For(name, /*start=*/b_->getInt64(start),
+               /*end=*/b_->getInt64(end),
+               /*step=*/b_->getInt64(step), for_body_generator);
   }
 
   void ForReturnVoid(
       tensorflow::StringPiece name, int64 start, int64 end, int64 step,
       const std::function<void(llvm::Value* ind_var, bool is_first_iteration)>&
           for_body_generator) {
-    ForReturnVoid(name, /*start=*/ir_builder_->getInt64(start),
-                  /*end=*/ir_builder_->getInt64(end),
-                  /*step=*/ir_builder_->getInt64(step), for_body_generator);
+    ForReturnVoid(name, /*start=*/b_->getInt64(start),
+                  /*end=*/b_->getInt64(end),
+                  /*step=*/b_->getInt64(step), for_body_generator);
   }
 
   // Generates the following control flow structure if `peel_first_iteration` is
@@ -184,17 +184,17 @@ class KernelSupportLibrary {
   Status For(
       tensorflow::StringPiece name, int64 start, int64 end, int64 step,
       const std::function<Status(llvm::Value* ind_var)>& for_body_generator) {
-    return For(name, /*start=*/ir_builder_->getInt64(start),
-               /*end=*/ir_builder_->getInt64(end),
-               /*step=*/ir_builder_->getInt64(step), for_body_generator);
+    return For(name, /*start=*/b_->getInt64(start),
+               /*end=*/b_->getInt64(end),
+               /*step=*/b_->getInt64(step), for_body_generator);
   }
 
   void ForReturnVoid(
       tensorflow::StringPiece name, int64 start, int64 end, int64 step,
       const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
-    ForReturnVoid(name, /*start=*/ir_builder_->getInt64(start),
-                  /*end=*/ir_builder_->getInt64(end),
-                  /*step=*/ir_builder_->getInt64(step), for_body_generator);
+    ForReturnVoid(name, /*start=*/b_->getInt64(start),
+                  /*end=*/b_->getInt64(end),
+                  /*step=*/b_->getInt64(step), for_body_generator);
   }
 
   // Generates the following control flow structure:
@@ -258,41 +258,39 @@ class KernelSupportLibrary {
   // in a nullptr llvm::Value* in its position to `kernel_body_generator`.
   // Currently we only support at most one nullptr value in `arguments`.
   static void EmitAndCallOutlinedKernel(
-      bool enable_fast_math, bool optimize_for_size,
-      llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
-      ArgumentVector arguments,
+      bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b,
+      tensorflow::StringPiece kernel_name, ArgumentVector arguments,
       const std::function<void(ArgumentVector)>& kernel_body_generator);
 
   // Thin wrappers around the more general EmitAndCallOutlinedKernel above.
   static void EmitAndCallOutlinedKernel(
-      bool enable_fast_math, bool optimize_for_size,
-      llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
-      llvm::Value* arg0, llvm::Value* arg1, llvm::Value* arg2,
+      bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b,
+      tensorflow::StringPiece kernel_name, llvm::Value* arg0, llvm::Value* arg1,
+      llvm::Value* arg2,
       const std::function<void(llvm::Value*, llvm::Value*, llvm::Value*)>&
           kernel_body_generator) {
     EmitAndCallOutlinedKernel(
-        enable_fast_math, optimize_for_size, ir_builder, kernel_name,
-        {arg0, arg1, arg2}, [&](ArgumentVector args) {
+        enable_fast_math, optimize_for_size, b, kernel_name, {arg0, arg1, arg2},
+        [&](ArgumentVector args) {
           kernel_body_generator(args[0], args[1], args[2]);
         });
   }
 
   static void EmitAndCallOutlinedKernel(
-      bool enable_fast_math, bool optimize_for_size,
-      llvm::IRBuilder<>* ir_builder, tensorflow::StringPiece kernel_name,
-      llvm::Value* arg0, llvm::Value* arg1, llvm::Value* arg2,
-      llvm::Value* arg3,
+      bool enable_fast_math, bool optimize_for_size, llvm::IRBuilder<>* b,
+      tensorflow::StringPiece kernel_name, llvm::Value* arg0, llvm::Value* arg1,
+      llvm::Value* arg2, llvm::Value* arg3,
       const std::function<void(llvm::Value*, llvm::Value*, llvm::Value*,
                                llvm::Value*)>& kernel_body_generator) {
     EmitAndCallOutlinedKernel(
-        enable_fast_math, optimize_for_size, ir_builder, kernel_name,
+        enable_fast_math, optimize_for_size, b, kernel_name,
         {arg0, arg1, arg2, arg3}, [&](ArgumentVector args) {
           kernel_body_generator(args[0], args[1], args[2], args[3]);
         });
   }
 
  private:
-  llvm::IRBuilder<>* ir_builder_;
+  llvm::IRBuilder<>* b_;
   llvm_ir::UnrollMode unroll_mode_;
   bool prevent_vectorization_;
 };
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
index 533b75cdae00dbc8244e502eb78adaf6f808b62e..35b394127288d816952b48c84b193257bab0bcda 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
@@ -94,24 +94,24 @@ tensorflow::gtl::optional<std::vector<int64> > FindTranspose021(
 IrArray::Index GetUnreducedOutputIndex(
     const IrArray::Index& reduced_output_index,
     const Shape& reduced_output_shape, const Shape& unreduced_output_shape,
-    llvm::IRBuilder<>* ir_builder) {
+    llvm::IRBuilder<>* b) {
   auto bounds = reduced_output_shape.dimensions();
   auto minor_to_major = reduced_output_shape.layout().minor_to_major();
   llvm::Value* linear_index = reduced_output_index.GetConstantWithIndexType(0);
   int64 multiplier = 1;
   for (int i = 0; i < reduced_output_index.size(); ++i) {
     int64 dim = minor_to_major[i];
-    llvm::Value* addend = ir_builder->CreateMul(
-        reduced_output_index[dim],
-        reduced_output_index.GetConstantWithIndexType(multiplier),
-        "linearizing",
-        /*HasNUW=*/true, /*HasNSW=*/true);
-    linear_index = ir_builder->CreateAdd(linear_index, addend, "",
-                                         /*HasNUW=*/true, /*HasNSW=*/true);
+    llvm::Value* addend =
+        b->CreateMul(reduced_output_index[dim],
+                     reduced_output_index.GetConstantWithIndexType(multiplier),
+                     "linearizing",
+                     /*HasNUW=*/true, /*HasNSW=*/true);
+    linear_index = b->CreateAdd(linear_index, addend, "",
+                                /*HasNUW=*/true, /*HasNSW=*/true);
     multiplier *= bounds[dim];
   }
 
-  return IrArray::Index(linear_index, unreduced_output_shape, ir_builder);
+  return IrArray::Index(linear_index, unreduced_output_shape, b);
 }
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
index 6f1268fffbe5425d35142512d89871c6fb35db41..ccb9b8ba3e6b0079664f2da92ce67224e176fa1d 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
@@ -44,7 +44,7 @@ tensorflow::gtl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
 IrArray::Index GetUnreducedOutputIndex(
     const IrArray::Index& reduced_output_index,
     const Shape& reduced_output_shape, const Shape& unreduced_output_shape,
-    llvm::IRBuilder<>* ir_builder);
+    llvm::IRBuilder<>* b);
 
 // A class to represent information for tiled parameters to support IR emission
 // for 021 transpose.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
index c9ae7d3afd5cdc21157732f6d0dfa824268e86bd..ba7f94834c7fd04d97cec012537244323308b8ce 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
@@ -47,27 +47,27 @@ ForLoop::ForLoop(tensorflow::StringPiece prefix, tensorflow::StringPiece suffix,
 
 /* static */ std::unique_ptr<ForLoop> ForLoop::EmitForLoop(
     tensorflow::StringPiece prefix, llvm::Value* start_index,
-    llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* ir_builder,
+    llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* b,
     UnrollMode unroll_mode, bool prevent_vectorization) {
   std::unique_ptr<ForLoop> loop(new ForLoop(prefix, /*suffix=*/"", start_index,
                                             end_index, step, unroll_mode,
                                             prevent_vectorization));
-  loop->Emit(ir_builder);
+  loop->Emit(b);
   return loop;
 }
 
-void ForLoop::Emit(llvm::IRBuilder<>* ir_builder) {
+void ForLoop::Emit(llvm::IRBuilder<>* b) {
   // The preheader block is the block the builder is currently emitting
   // code into.
-  preheader_bb_ = ir_builder->GetInsertBlock();
+  preheader_bb_ = b->GetInsertBlock();
 
-  llvm::BasicBlock::iterator insert_point = ir_builder->GetInsertPoint();
+  llvm::BasicBlock::iterator insert_point = b->GetInsertPoint();
   if (insert_point == preheader_bb_->end()) {
     // We're emitting the loop at the end of a basic block. Verify there is no
     // terminator (eg, branch) in the basic block.
     CHECK_EQ(nullptr, preheader_bb_->getTerminator());
 
-    exit_bb_ = CreateLoopBB("loop_exit", ir_builder);
+    exit_bb_ = CreateLoopBB("loop_exit", b);
   } else {
     // We're emitting the loop into the middle of a basic block. splitBasicBlock
     // requires that this basic block be well-formed (have a terminator).
@@ -86,51 +86,50 @@ void ForLoop::Emit(llvm::IRBuilder<>* ir_builder) {
   insert_before_bb_ = exit_bb_;
 
   // Create remaining basic block which form the inside of the loop.
-  header_bb_ = CreateLoopBB("loop_header", ir_builder);
-  body_bb_ = CreateLoopBB("loop_body", ir_builder);
+  header_bb_ = CreateLoopBB("loop_header", b);
+  body_bb_ = CreateLoopBB("loop_body", b);
 
   // Function entry basic block.
   // Emit alloca for the induction variable. We do this at the entry to the
   // basic block to ensure the alloc only executes once per function (we could
   // be emitting a nested loop).
   llvm::Function* func = preheader_bb_->getParent();
-  ir_builder->SetInsertPoint(&func->getEntryBlock(),
-                             func->getEntryBlock().getFirstInsertionPt());
+  b->SetInsertPoint(&func->getEntryBlock(),
+                    func->getEntryBlock().getFirstInsertionPt());
   llvm::Value* indvar_address =
-      ir_builder->CreateAlloca(start_index_->getType(), nullptr,
-                               AsStringRef(GetQualifiedName("invar_address")));
+      b->CreateAlloca(start_index_->getType(), nullptr,
+                      AsStringRef(GetQualifiedName("invar_address")));
 
   // Preheader basic block.
   // Initialize induction variable starting index. Create branch to the header.
-  ir_builder->SetInsertPoint(preheader_bb_);
-  ir_builder->CreateStore(start_index_, indvar_address);
+  b->SetInsertPoint(preheader_bb_);
+  b->CreateStore(start_index_, indvar_address);
   // The preheader should not have a branch yet.
   CHECK_EQ(preheader_bb_->getTerminator(), nullptr);
-  ir_builder->CreateBr(header_bb_);
+  b->CreateBr(header_bb_);
 
   // Header basic block.
   // Emit the loop conditional branch. Load and compare indvar with ending
   // index and jump to loop exit if equal. Jump to body otherwise.
-  ir_builder->SetInsertPoint(header_bb_);
-  indvar_ = ir_builder->CreateLoad(indvar_address,
-                                   AsStringRef(GetQualifiedName("indvar")));
-  llvm::Value* exit_cond = ir_builder->CreateICmpUGE(indvar_, end_index_);
-  ir_builder->CreateCondBr(/*Cond=*/exit_cond,
-                           /*True=*/exit_bb_, /*False=*/body_bb_);
+  b->SetInsertPoint(header_bb_);
+  indvar_ =
+      b->CreateLoad(indvar_address, AsStringRef(GetQualifiedName("indvar")));
+  llvm::Value* exit_cond = b->CreateICmpUGE(indvar_, end_index_);
+  b->CreateCondBr(/*Cond=*/exit_cond,
+                  /*True=*/exit_bb_, /*False=*/body_bb_);
 
   // Body basic block.
   // Increment indvar, store indvar, and jump to header.
-  ir_builder->SetInsertPoint(body_bb_);
+  b->SetInsertPoint(body_bb_);
   llvm::Value* step = step_;
   llvm::Value* indvar = indvar_;
 
-  llvm::Value* indvar_inc =
-      ir_builder->CreateAdd(indvar, step, "invar.inc",
-                            /*HasNUW=*/true, /*HasNSW=*/true);
-  ir_builder->CreateStore(indvar_inc, indvar_address);
-  llvm::BranchInst* back_branch = ir_builder->CreateBr(header_bb_);
+  llvm::Value* indvar_inc = b->CreateAdd(indvar, step, "invar.inc",
+                                         /*HasNUW=*/true, /*HasNSW=*/true);
+  b->CreateStore(indvar_inc, indvar_address);
+  llvm::BranchInst* back_branch = b->CreateBr(header_bb_);
 
-  std::vector<llvm::Metadata*> loop_metadata = GetLoopMetadata(ir_builder);
+  std::vector<llvm::Metadata*> loop_metadata = GetLoopMetadata(b);
   if (!loop_metadata.empty()) {
     llvm::LLVMContext* ctx = &start_index_->getContext();
     auto temp_node = llvm::MDNode::getTemporary(*ctx, llvm::None);
@@ -141,11 +140,10 @@ void ForLoop::Emit(llvm::IRBuilder<>* ir_builder) {
   }
 
   // Re-point the IR builder to the loop exit block.
-  ir_builder->SetInsertPoint(exit_bb_);
+  b->SetInsertPoint(exit_bb_);
 }
 
-std::vector<llvm::Metadata*> ForLoop::GetLoopMetadata(
-    llvm::IRBuilder<>* ir_builder) {
+std::vector<llvm::Metadata*> ForLoop::GetLoopMetadata(llvm::IRBuilder<>* b) {
   const char* const kLlvmLoopUnrollDisableMDName = "llvm.loop.unroll.disable";
   const char* const kLlvmLoopUnrollFullMDName = "llvm.loop.unroll.full";
   const char* const kLlvmLoopVectorizeMDName = "llvm.loop.vectorize.enable";
@@ -160,7 +158,7 @@ std::vector<llvm::Metadata*> ForLoop::GetLoopMetadata(
   if (prevent_vectorization_) {
     result.push_back(llvm::MDNode::get(
         *ctx, {llvm::MDString::get(*ctx, kLlvmLoopVectorizeMDName),
-               llvm::ConstantAsMetadata::get(ir_builder->getFalse())}));
+               llvm::ConstantAsMetadata::get(b->getFalse())}));
   }
 
   if (unroll_mode_ == xla::llvm_ir::UnrollMode::kFullyUnroll) {
@@ -175,9 +173,8 @@ string ForLoop::GetQualifiedName(tensorflow::StringPiece name) {
 }
 
 llvm::BasicBlock* ForLoop::CreateLoopBB(tensorflow::StringPiece name,
-                                        llvm::IRBuilder<>* ir_builder) {
-  return CreateBasicBlock(insert_before_bb_, GetQualifiedName(name),
-                          ir_builder);
+                                        llvm::IRBuilder<>* b) {
+  return CreateBasicBlock(insert_before_bb_, GetQualifiedName(name), b);
 }
 
 std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
@@ -197,12 +194,12 @@ std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
                                               bool prevent_vectorization) {
   if (inner_loop_body_bb_ != nullptr) {
     // Create this loop inside the previous one.
-    ir_builder_->SetInsertPoint(&*inner_loop_body_bb_->getFirstInsertionPt());
+    b_->SetInsertPoint(&*inner_loop_body_bb_->getFirstInsertionPt());
   }
   std::unique_ptr<ForLoop> loop(new ForLoop(
       /*prefix=*/name_, suffix, start_index, end_index, stride, unroll_mode,
       prevent_vectorization));
-  loop->Emit(ir_builder_);
+  loop->Emit(b_);
 
   if (outer_loop_preheader_bb_ == nullptr) {
     outer_loop_preheader_bb_ = loop->GetPreheaderBasicBlock();
@@ -262,5 +259,35 @@ IrArray::Index ForLoopNest::AddLoopsForShapeOnDimensions(
   return index;
 }
 
+IrArray::Index ForLoopNest::EmitOperandArrayLoopNest(
+    const llvm_ir::IrArray& operand_array, int64 dimension_to_skip,
+    tensorflow::StringPiece name_suffix) {
+  // Prepares the dimension list we will use to emit the loop nest. Outermost
+  // loops are added first. Add loops in major-to-minor order, and skip the
+  // 'dimension_to_skip' dimension.
+  std::vector<int64> dimensions;
+  const Shape& shape = operand_array.GetShape();
+  for (int64 dimension : LayoutUtil::MinorToMajor(shape)) {
+    if (dimension != dimension_to_skip) {
+      dimensions.push_back(dimension);
+    }
+  }
+
+  // Create loop nest with one for-loop for each dimension of the
+  // output.
+  llvm_ir::IrArray::Index index =
+      AddLoopsForShapeOnDimensions(shape, dimensions, name_suffix);
+  // Verify every dimension except the 'dimension_to_skip' dimension was set in
+  // the index.
+  for (size_t dimension = 0; dimension < index.size(); ++dimension) {
+    if (dimension == dimension_to_skip) {
+      DCHECK_EQ(nullptr, index[dimension]);
+    } else {
+      DCHECK_NE(nullptr, index[dimension]);
+    }
+  }
+  return index;
+}
+
 }  // namespace llvm_ir
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
index 0dd5b9d3b2656af68f76c2adfcb1f3a1385eeb91..a4fed5c8dc55d38d25031252e3960404a5bf84e6 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
@@ -79,7 +79,7 @@ class ForLoop {
   //  loop.
   static std::unique_ptr<ForLoop> EmitForLoop(
       tensorflow::StringPiece prefix, llvm::Value* start_index,
-      llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* ir_builder,
+      llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* b,
       UnrollMode unroll_mode = llvm_ir::UnrollMode::kDefaultUnroll,
       bool prevent_vectorization = false);
 
@@ -138,10 +138,10 @@ class ForLoop {
           UnrollMode unroll_mode, bool prevent_vectorization);
 
   // Emit the loop at the insert point of the builder.
-  void Emit(llvm::IRBuilder<>* ir_builder);
+  void Emit(llvm::IRBuilder<>* b);
 
   llvm::BasicBlock* CreateLoopBB(tensorflow::StringPiece name,
-                                 llvm::IRBuilder<>* ir_builder);
+                                 llvm::IRBuilder<>* b);
 
   // Creates a name for an LLVM construct, appending prefix_ and suffix_, if
   // they are set.
@@ -149,7 +149,7 @@ class ForLoop {
 
   // Return a list of metadata nodes that should be associated with the
   // llvm::Loop for this `ForLoop`.
-  std::vector<llvm::Metadata*> GetLoopMetadata(llvm::IRBuilder<>* ir_builder);
+  std::vector<llvm::Metadata*> GetLoopMetadata(llvm::IRBuilder<>* b);
 
   string prefix_;
   string suffix_;
@@ -177,19 +177,18 @@ class ForLoop {
 // A simple class for constructing nested for-loops.
 class ForLoopNest {
  public:
-  explicit ForLoopNest(llvm::IRBuilder<>* ir_builder,
-                       llvm::Type* index_ty = nullptr)
-      : ForLoopNest(/*name=*/"", ir_builder) {
+  explicit ForLoopNest(llvm::IRBuilder<>* b, llvm::Type* index_ty = nullptr)
+      : ForLoopNest(/*name=*/"", b) {
     SetIndexType(index_ty);
   }
 
-  ForLoopNest(tensorflow::StringPiece name, llvm::IRBuilder<>* ir_builder,
+  ForLoopNest(tensorflow::StringPiece name, llvm::IRBuilder<>* b,
               llvm::Type* index_ty = nullptr)
       : name_(std::string(name)),
         outer_loop_preheader_bb_(nullptr),
         outer_loop_exit_bb_(nullptr),
         inner_loop_body_bb_(nullptr),
-        ir_builder_(ir_builder) {
+        b_(b) {
     SetIndexType(index_ty);
   }
 
@@ -248,6 +247,17 @@ class ForLoopNest {
       const Shape& shape, tensorflow::gtl::ArraySlice<int64> dimensions,
       tensorflow::StringPiece suffix);
 
+  // Emits a series of nested loops for iterating over an operand array. Loops
+  // are constructed in major to minor dimension layout order. No loop is
+  // emitted for the given 'dimension_to_skip'. The function returns an IrArray
+  // index for the given operand_array containing the indvars of the loops. All
+  // dimensions of the index are filled except for 'dimension_to_skip'.
+  // name_suffix is the string to append to the names of LLVM constructs (eg,
+  // basic blocks) constructed by this method.
+  IrArray::Index EmitOperandArrayLoopNest(const llvm_ir::IrArray& operand_array,
+                                          int64 dimension_to_skip,
+                                          tensorflow::StringPiece name_suffix);
+
   // Convenience methods which return particular basic blocks of the outermost
   // or innermost loops. These methods return nullptr if no loops have been
   // added yet.
@@ -259,7 +269,7 @@ class ForLoopNest {
 
  private:
   void SetIndexType(llvm::Type* index_ty) {
-    index_type_ = index_ty == nullptr ? ir_builder_->getInt64Ty() : index_ty;
+    index_type_ = index_ty == nullptr ? b_->getInt64Ty() : index_ty;
   }
 
   llvm::Constant* GetConstantWithIndexType(int64 c) const {
@@ -278,7 +288,7 @@ class ForLoopNest {
   // has been added yet.
   llvm::BasicBlock* inner_loop_body_bb_;
 
-  llvm::IRBuilder<>* ir_builder_;
+  llvm::IRBuilder<>* b_;
 
   llvm::Type* index_type_;
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index 6c55361b44b51d49cb47ea0690ff4954d4e93f84..e6126881af8b8123e08a4eaa934b52a7fd378ce6 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -48,8 +48,8 @@ namespace {
 
 // Note, this function is only useful in an insertion context; in a global
 // (e.g. constants) context it will CHECK fail.
-llvm::Module* ModuleFromIRBuilder(llvm::IRBuilder<>* ir_builder) {
-  auto block = CHECK_NOTNULL(ir_builder->GetInsertBlock());
+llvm::Module* ModuleFromIRBuilder(llvm::IRBuilder<>* b) {
+  auto block = CHECK_NOTNULL(b->GetInsertBlock());
   auto fn = CHECK_NOTNULL(block->getParent());
   auto module = CHECK_NOTNULL(fn->getParent());
   return module;
@@ -87,41 +87,41 @@ llvm::Value* EmitCallToIntrinsic(
     llvm::Intrinsic::ID intrinsic_id,
     tensorflow::gtl::ArraySlice<llvm::Value*> operands,
     tensorflow::gtl::ArraySlice<llvm::Type*> overloaded_types,
-    llvm::IRBuilder<>* ir_builder) {
-  llvm::Module* module = ModuleFromIRBuilder(ir_builder);
+    llvm::IRBuilder<>* b) {
+  llvm::Module* module = ModuleFromIRBuilder(b);
   llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration(
       module, intrinsic_id, AsArrayRef(overloaded_types));
-  return ir_builder->CreateCall(intrinsic, AsArrayRef(operands));
+  return b->CreateCall(intrinsic, AsArrayRef(operands));
 }
 
 llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value,
-                          llvm::IRBuilder<>* ir_builder) {
-  if (ir_builder->getFastMathFlags().noNaNs()) {
-    auto cmp = ir_builder->CreateFCmpUGE(lhs_value, rhs_value);
-    return ir_builder->CreateSelect(cmp, lhs_value, rhs_value);
+                          llvm::IRBuilder<>* b) {
+  if (b->getFastMathFlags().noNaNs()) {
+    auto cmp = b->CreateFCmpUGE(lhs_value, rhs_value);
+    return b->CreateSelect(cmp, lhs_value, rhs_value);
   } else {
-    auto cmp_ge = ir_builder->CreateFCmpOGE(lhs_value, rhs_value);
-    auto lhs_is_nan = ir_builder->CreateFCmpUNE(lhs_value, lhs_value);
-    auto sel_lhs = ir_builder->CreateOr(cmp_ge, lhs_is_nan);
-    return ir_builder->CreateSelect(sel_lhs, lhs_value, rhs_value);
+    auto cmp_ge = b->CreateFCmpOGE(lhs_value, rhs_value);
+    auto lhs_is_nan = b->CreateFCmpUNE(lhs_value, lhs_value);
+    auto sel_lhs = b->CreateOr(cmp_ge, lhs_is_nan);
+    return b->CreateSelect(sel_lhs, lhs_value, rhs_value);
   }
 }
 
 llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value,
-                          llvm::IRBuilder<>* ir_builder) {
-  if (ir_builder->getFastMathFlags().noNaNs()) {
-    auto cmp = ir_builder->CreateFCmpULE(lhs_value, rhs_value);
-    return ir_builder->CreateSelect(cmp, lhs_value, rhs_value);
+                          llvm::IRBuilder<>* b) {
+  if (b->getFastMathFlags().noNaNs()) {
+    auto cmp = b->CreateFCmpULE(lhs_value, rhs_value);
+    return b->CreateSelect(cmp, lhs_value, rhs_value);
   } else {
-    auto cmp_le = ir_builder->CreateFCmpOLE(lhs_value, rhs_value);
-    auto lhs_is_nan = ir_builder->CreateFCmpUNE(lhs_value, lhs_value);
-    auto sel_lhs = ir_builder->CreateOr(cmp_le, lhs_is_nan);
-    return ir_builder->CreateSelect(sel_lhs, lhs_value, rhs_value);
+    auto cmp_le = b->CreateFCmpOLE(lhs_value, rhs_value);
+    auto lhs_is_nan = b->CreateFCmpUNE(lhs_value, lhs_value);
+    auto sel_lhs = b->CreateOr(cmp_le, lhs_is_nan);
+    return b->CreateSelect(sel_lhs, lhs_value, rhs_value);
   }
 }
 
 llvm::Value* EmitBufferIndexingGEP(llvm::Value* array, llvm::Value* index,
-                                   llvm::IRBuilder<>* ir_builder) {
+                                   llvm::IRBuilder<>* b) {
   llvm::Type* array_type = array->getType();
   CHECK(array_type->isPointerTy());
   llvm::PointerType* array_type_as_pointer =
@@ -131,16 +131,16 @@ llvm::Value* EmitBufferIndexingGEP(llvm::Value* array, llvm::Value* index,
           << " array=" << llvm_ir::DumpToString(*array)
           << " index=" << llvm_ir::DumpToString(*index);
 
-  return ir_builder->CreateInBoundsGEP(
+  return b->CreateInBoundsGEP(
       array_type_as_pointer->getElementType(), array,
       llvm::isa<llvm::GlobalVariable>(array)
-          ? llvm::ArrayRef<llvm::Value*>({ir_builder->getInt64(0), index})
+          ? llvm::ArrayRef<llvm::Value*>({b->getInt64(0), index})
           : index);
 }
 
 llvm::Value* EmitBufferIndexingGEP(llvm::Value* array, int64 index,
-                                   llvm::IRBuilder<>* ir_builder) {
-  return EmitBufferIndexingGEP(array, ir_builder->getInt64(index), ir_builder);
+                                   llvm::IRBuilder<>* b) {
+  return EmitBufferIndexingGEP(array, b->getInt64(index), b);
 }
 
 llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type,
@@ -232,14 +232,15 @@ llvm::Type* ShapeToIrType(const Shape& shape, llvm::Module* module) {
   return result_type;
 }
 
-StatusOr<llvm::Value*> EncodeSelfDescribingShapeConstant(
-    const Shape& shape, int32* shape_size, llvm::IRBuilder<>* ir_builder) {
+StatusOr<llvm::Value*> EncodeSelfDescribingShapeConstant(const Shape& shape,
+                                                         int32* shape_size,
+                                                         llvm::IRBuilder<>* b) {
   string encoded_shape = shape.SerializeAsString();
   if (encoded_shape.size() > std::numeric_limits<int32>::max()) {
     return InternalError("Encoded shape size exceeded int32 size limit.");
   }
   *shape_size = static_cast<int32>(encoded_shape.size());
-  return ir_builder->CreateGlobalStringPtr(llvm_ir::AsStringRef(encoded_shape));
+  return b->CreateGlobalStringPtr(llvm_ir::AsStringRef(encoded_shape));
 }
 
 StatusOr<Shape> DecodeSelfDescribingShapeConstant(const void* shape_ptr,
@@ -262,59 +263,57 @@ llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal,
 
 llvm::AllocaInst* EmitAllocaAtFunctionEntry(llvm::Type* type,
                                             tensorflow::StringPiece name,
-                                            llvm::IRBuilder<>* ir_builder,
+                                            llvm::IRBuilder<>* b,
                                             int alignment) {
-  return EmitAllocaAtFunctionEntryWithCount(type, nullptr, name, ir_builder,
-                                            alignment);
+  return EmitAllocaAtFunctionEntryWithCount(type, nullptr, name, b, alignment);
 }
 
 llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(
     llvm::Type* type, llvm::Value* element_count, tensorflow::StringPiece name,
-    llvm::IRBuilder<>* ir_builder, int alignment) {
-  llvm::IRBuilder<>::InsertPoint insert_point = ir_builder->saveIP();
-  llvm::Function* function = ir_builder->GetInsertBlock()->getParent();
-  ir_builder->SetInsertPoint(&function->getEntryBlock(),
-                             function->getEntryBlock().getFirstInsertionPt());
+    llvm::IRBuilder<>* b, int alignment) {
+  llvm::IRBuilder<>::InsertPoint insert_point = b->saveIP();
+  llvm::Function* function = b->GetInsertBlock()->getParent();
+  b->SetInsertPoint(&function->getEntryBlock(),
+                    function->getEntryBlock().getFirstInsertionPt());
   llvm::AllocaInst* alloca =
-      ir_builder->CreateAlloca(type, element_count, AsStringRef(name));
+      b->CreateAlloca(type, element_count, AsStringRef(name));
   if (alignment != 0) {
     alloca->setAlignment(alignment);
   }
-  ir_builder->restoreIP(insert_point);
+  b->restoreIP(insert_point);
   return alloca;
 }
 
 llvm::BasicBlock* CreateBasicBlock(llvm::BasicBlock* insert_before,
                                    tensorflow::StringPiece name,
-                                   llvm::IRBuilder<>* ir_builder) {
+                                   llvm::IRBuilder<>* b) {
   return llvm::BasicBlock::Create(
-      /*Context=*/ir_builder->getContext(),
+      /*Context=*/b->getContext(),
       /*Name=*/AsStringRef(name),
-      /*Parent=*/ir_builder->GetInsertBlock()->getParent(),
+      /*Parent=*/b->GetInsertBlock()->getParent(),
       /*InsertBefore*/ insert_before);
 }
 
 LlvmIfData EmitIfThenElse(llvm::Value* condition, tensorflow::StringPiece name,
-                          llvm::IRBuilder<>* ir_builder, bool emit_else) {
+                          llvm::IRBuilder<>* b, bool emit_else) {
   llvm_ir::LlvmIfData if_data;
-  if_data.if_block = ir_builder->GetInsertBlock();
-  if_data.true_block = CreateBasicBlock(
-      nullptr, tensorflow::strings::StrCat(name, "-true"), ir_builder);
+  if_data.if_block = b->GetInsertBlock();
+  if_data.true_block =
+      CreateBasicBlock(nullptr, tensorflow::strings::StrCat(name, "-true"), b);
   if_data.false_block =
-      emit_else ? CreateBasicBlock(nullptr,
-                                   tensorflow::strings::StrCat(name, "-false"),
-                                   ir_builder)
+      emit_else ? CreateBasicBlock(
+                      nullptr, tensorflow::strings::StrCat(name, "-false"), b)
                 : nullptr;
 
   // Add a terminator to the if block, if necessary.
   if (if_data.if_block->getTerminator() == nullptr) {
-    ir_builder->SetInsertPoint(if_data.if_block);
+    b->SetInsertPoint(if_data.if_block);
     if_data.after_block = CreateBasicBlock(
-        nullptr, tensorflow::strings::StrCat(name, "-after"), ir_builder);
-    ir_builder->CreateBr(if_data.after_block);
+        nullptr, tensorflow::strings::StrCat(name, "-after"), b);
+    b->CreateBr(if_data.after_block);
   } else {
     if_data.after_block = if_data.if_block->splitBasicBlock(
-        ir_builder->GetInsertPoint(),
+        b->GetInsertPoint(),
         AsStringRef(tensorflow::strings::StrCat(name, "-after")));
   }
 
@@ -322,39 +321,37 @@ LlvmIfData EmitIfThenElse(llvm::Value* condition, tensorflow::StringPiece name,
   // we're going to replace it with a conditional branch.
   if_data.if_block->getTerminator()->eraseFromParent();
 
-  ir_builder->SetInsertPoint(if_data.if_block);
-  ir_builder->CreateCondBr(
-      condition, if_data.true_block,
-      emit_else ? if_data.false_block : if_data.after_block);
+  b->SetInsertPoint(if_data.if_block);
+  b->CreateCondBr(condition, if_data.true_block,
+                  emit_else ? if_data.false_block : if_data.after_block);
 
-  ir_builder->SetInsertPoint(if_data.true_block);
-  ir_builder->CreateBr(if_data.after_block);
+  b->SetInsertPoint(if_data.true_block);
+  b->CreateBr(if_data.after_block);
 
   if (emit_else) {
-    ir_builder->SetInsertPoint(if_data.false_block);
-    ir_builder->CreateBr(if_data.after_block);
+    b->SetInsertPoint(if_data.false_block);
+    b->CreateBr(if_data.after_block);
   }
 
-  ir_builder->SetInsertPoint(if_data.after_block,
-                             if_data.after_block->getFirstInsertionPt());
+  b->SetInsertPoint(if_data.after_block,
+                    if_data.after_block->getFirstInsertionPt());
 
   return if_data;
 }
 
 llvm::Value* EmitComparison(llvm::CmpInst::Predicate predicate,
                             llvm::Value* lhs_value, llvm::Value* rhs_value,
-                            llvm::IRBuilder<>* ir_builder) {
+                            llvm::IRBuilder<>* b) {
   llvm::Value* comparison_result;
   if (lhs_value->getType()->isIntegerTy()) {
-    comparison_result = ir_builder->CreateICmp(predicate, lhs_value, rhs_value);
+    comparison_result = b->CreateICmp(predicate, lhs_value, rhs_value);
   } else {
-    comparison_result = ir_builder->CreateFCmp(predicate, lhs_value, rhs_value);
+    comparison_result = b->CreateFCmp(predicate, lhs_value, rhs_value);
   }
   // comparison_result is i1, but the NVPTX codegen incorrectly lowers i1
   // arrays. So we extend it to i8 so that it's addressable.
-  return ir_builder->CreateZExt(
-      comparison_result,
-      llvm_ir::PrimitiveTypeToIrType(PRED, ModuleFromIRBuilder(ir_builder)));
+  return b->CreateZExt(comparison_result, llvm_ir::PrimitiveTypeToIrType(
+                                              PRED, ModuleFromIRBuilder(b)));
 }
 
 // Internal helper that is called from emitted code to log an int64 value with a
@@ -363,17 +360,14 @@ static void LogS64(const char* tag, int64 value) {
   LOG(INFO) << tag << " (int64): " << value;
 }
 
-void EmitLogging(const char* tag, llvm::Value* value,
-                 llvm::IRBuilder<>* ir_builder) {
+void EmitLogging(const char* tag, llvm::Value* value, llvm::IRBuilder<>* b) {
   llvm::FunctionType* log_function_type = llvm::FunctionType::get(
-      ir_builder->getVoidTy(),
-      {ir_builder->getInt64Ty(), ir_builder->getInt64Ty()}, /*isVarArg=*/false);
-  ir_builder->CreateCall(
+      b->getVoidTy(), {b->getInt64Ty(), b->getInt64Ty()}, /*isVarArg=*/false);
+  b->CreateCall(
       log_function_type,
-      ir_builder->CreateIntToPtr(
-          ir_builder->getInt64(tensorflow::bit_cast<int64>(&LogS64)),
-          log_function_type->getPointerTo()),
-      {ir_builder->getInt64(tensorflow::bit_cast<int64>(tag)), value});
+      b->CreateIntToPtr(b->getInt64(tensorflow::bit_cast<int64>(&LogS64)),
+                        log_function_type->getPointerTo()),
+      {b->getInt64(tensorflow::bit_cast<int64>(tag)), value});
 }
 
 void SetAlignmentMetadataForLoad(llvm::LoadInst* load, uint64_t alignment) {
@@ -663,5 +657,56 @@ void InitializeLLVMCommandLineOptions(const HloModuleConfig& config) {
   }
 }
 
+std::pair<llvm::Value*, llvm::Value*> UMulLowHigh32(llvm::IRBuilder<>* b,
+                                                    llvm::Value* src0,
+                                                    llvm::Value* src1) {
+  CHECK_EQ(src0->getType()->getPrimitiveSizeInBits(), 32);
+  CHECK_EQ(src1->getType()->getPrimitiveSizeInBits(), 32);
+  llvm::Type* int64_ty = b->getInt64Ty();
+  src0 = b->CreateZExt(src0, int64_ty);
+  src1 = b->CreateZExt(src1, int64_ty);
+  return SplitInt64ToInt32s(b, b->CreateMul(src0, src1));
+}
+
+std::pair<llvm::Value*, llvm::Value*> SplitInt64ToInt32s(
+    llvm::IRBuilder<>* b, llvm::Value* value_64bits) {
+  CHECK_EQ(value_64bits->getType()->getPrimitiveSizeInBits(), 64);
+  llvm::Type* int32_ty = b->getInt32Ty();
+  llvm::Value* low_32bits = b->CreateTrunc(value_64bits, int32_ty);
+  llvm::Value* high_32bits =
+      b->CreateTrunc(b->CreateLShr(value_64bits, 32), int32_ty);
+  return std::make_pair(low_32bits, high_32bits);
+}
+
+llvm::GlobalVariable* GetOrCreateVariableForPhiloxRngState(
+    llvm::Module* module, llvm::IRBuilder<>* b) {
+  static const char* kPhiloxRngStateVariableName = "philox_rng_state";
+  llvm::GlobalVariable* state_ptr =
+      module->getNamedGlobal(kPhiloxRngStateVariableName);
+  if (!state_ptr) {
+    state_ptr = new llvm::GlobalVariable(
+        /*M=*/*module,
+        /*Ty=*/b->getInt64Ty(),
+        /*isConstant=*/false,
+        /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
+        /*Initializer=*/b->getInt64(0),
+        /*Name=*/kPhiloxRngStateVariableName);
+  }
+  return state_ptr;
+}
+
+void IncrementVariableForPhiloxRngState(int64 value, llvm::Module* module,
+                                        llvm::IRBuilder<>* builder) {
+  llvm::GlobalVariable* state_ptr =
+      GetOrCreateVariableForPhiloxRngState(module, builder);
+  llvm::Value* state_value_old = builder->CreateLoad(state_ptr, "load_state");
+  // If the 64-bit value overflows, we use the wraparound value. This should
+  // be fine in practice as we only add one to the value each time when a RNG is
+  // executed.
+  llvm::Value* state_value_new = builder->CreateAdd(
+      state_value_old, builder->getInt64(value), "inc_state");
+  builder->CreateStore(state_value_new, state_ptr);
+}
+
 }  // namespace llvm_ir
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
index 9c51861eacaafcbc120e4b5f3301fe208d4c7bff..09583985342033d486d50910b6f5ca732a9a3756 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
@@ -105,26 +105,26 @@ llvm::Value* EmitCallToIntrinsic(
     llvm::Intrinsic::ID intrinsic_id,
     tensorflow::gtl::ArraySlice<llvm::Value*> operands,
     tensorflow::gtl::ArraySlice<llvm::Type*> overloaded_types,
-    llvm::IRBuilder<>* ir_builder);
+    llvm::IRBuilder<>* b);
 
 // Emit float max. Emit maxnum intrinsic is fast math is disabled, or
 // fcmp+select otherwise
 llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value,
-                          llvm::IRBuilder<>* ir_builder);
+                          llvm::IRBuilder<>* b);
 
 // Emit float min. Emit minnum intrinsic is fast math is disabled, or
 // fcmp+select otherwise
 llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value,
-                          llvm::IRBuilder<>* ir_builder);
+                          llvm::IRBuilder<>* b);
 
 // Convenience methods for emitting a GEP instruction that indexes into a buffer
 // (1-dimensional array), equivalent to array[index]. The type is automatically
 // determined from the element type of the array.  The int64 index overload
 // wraps the index in a i64 llvm::Value.
 llvm::Value* EmitBufferIndexingGEP(llvm::Value* array, llvm::Value* index,
-                                   llvm::IRBuilder<>* ir_builder);
+                                   llvm::IRBuilder<>* b);
 llvm::Value* EmitBufferIndexingGEP(llvm::Value* array, int64 index,
-                                   llvm::IRBuilder<>* ir_builder);
+                                   llvm::IRBuilder<>* b);
 
 // Returns the LLVM type which represents the given XLA primitive type.
 llvm::Type* PrimitiveTypeToIrType(PrimitiveType element_type,
@@ -139,8 +139,9 @@ llvm::Type* ShapeToIrType(const Shape& shape, llvm::Module* module);
 
 // Returns a value that represents a pointer to a global string constant that
 // encodes the shape as a serialized protobuf.
-StatusOr<llvm::Value*> EncodeSelfDescribingShapeConstant(
-    const Shape& shape, int32* shape_size, llvm::IRBuilder<>* ir_builder);
+StatusOr<llvm::Value*> EncodeSelfDescribingShapeConstant(const Shape& shape,
+                                                         int32* shape_size,
+                                                         llvm::IRBuilder<>* b);
 
 // Inverses the encoding of a Shape protobuf into an LLVM global variable.
 //
@@ -164,21 +165,21 @@ llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal,
 // through a loop.
 llvm::AllocaInst* EmitAllocaAtFunctionEntry(llvm::Type* type,
                                             tensorflow::StringPiece name,
-                                            llvm::IRBuilder<>* ir_builder,
+                                            llvm::IRBuilder<>* b,
                                             int alignment = 0);
 
 // As EmitAllocaAtFunctionEntry, but allocates element_count entries
 // instead of a single element.
 llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(
     llvm::Type* type, llvm::Value* element_count, tensorflow::StringPiece name,
-    llvm::IRBuilder<>* ir_builder, int alignment = 0);
+    llvm::IRBuilder<>* b, int alignment = 0);
 
 // Creates a basic block with the same context and function as for the
 // builder. Inserts at the end of the function if insert_before is
 // null.
 llvm::BasicBlock* CreateBasicBlock(llvm::BasicBlock* insert_before,
                                    tensorflow::StringPiece name,
-                                   llvm::IRBuilder<>* ir_builder);
+                                   llvm::IRBuilder<>* b);
 
 // Struct with data on a conditional branch in a diamond shape created
 // via EmitIfThenElse.
@@ -210,13 +211,13 @@ struct LlvmIfData {
 // block with a terminator. If you need to use this for a
 // non-terminated block, just make the function able to do that too.
 LlvmIfData EmitIfThenElse(llvm::Value* condition, tensorflow::StringPiece name,
-                          llvm::IRBuilder<>* ir_builder, bool emit_else = true);
+                          llvm::IRBuilder<>* b, bool emit_else = true);
 
 // Emits a compare operation between "lhs" and "rhs" with the given predicate,
 // and then converts the result to i8 so that it is addressable.
 llvm::Value* EmitComparison(llvm::CmpInst::Predicate predicate,
                             llvm::Value* lhs, llvm::Value* rhs,
-                            llvm::IRBuilder<>* ir_builder);
+                            llvm::IRBuilder<>* b);
 
 // Emits a call that logs the given value with the given tag as a prefix.
 // The provided tag and value are passed to a runtime logging call that is
@@ -228,8 +229,7 @@ llvm::Value* EmitComparison(llvm::CmpInst::Predicate predicate,
 // Precondition: value must be an int64.
 // Precondition: tag must be a stable pointer for the lifetime of the generated
 // program (the constant pointer is burned in to the program).
-void EmitLogging(const char* tag, llvm::Value* value,
-                 llvm::IRBuilder<>* ir_builder);
+void EmitLogging(const char* tag, llvm::Value* value, llvm::IRBuilder<>* b);
 
 // Adds alignment metadata to a load instruction using the given alignment.
 // The alignment refers to the result of the load, not the load itself.
@@ -292,6 +292,27 @@ llvm::Function* CreateFunction(llvm::FunctionType* function_type,
 // don't start with xla_ to LLVM.
 void InitializeLLVMCommandLineOptions(const HloModuleConfig& config);
 
+// Zero-extends two 32-bit values to 64 bits, multiplies them, and returns the
+// result as a pair of (low 32 bits, high 32 bits).
+std::pair<llvm::Value*, llvm::Value*> UMulLowHigh32(llvm::IRBuilder<>* b,
+                                                    llvm::Value* src0,
+                                                    llvm::Value* src1);
+// Splits the 64-bit integer value into its high and low 32 bits.
+std::pair<llvm::Value*, llvm::Value*> SplitInt64ToInt32s(
+    llvm::IRBuilder<>* b, llvm::Value* value_64bits);
+
+// Checks whether a global variable is already created to represent a
+// state passed between RNG calls implemented with Philox algorithm. If not,
+// creates such a variable. Returns the global variable.
+llvm::GlobalVariable* GetOrCreateVariableForPhiloxRngState(
+    llvm::Module* module, llvm::IRBuilder<>* b);
+
+// Adds a value to the global state variable each time when a RNG hlo is
+// executed. The value of this global state variable is added to the seed
+// of the Philox RNG algorithm so that calling the same RNG Hlo multiple times
+// should rarely produce the same result.
+void IncrementVariableForPhiloxRngState(int64 value, llvm::Module* module,
+                                        llvm::IRBuilder<>* b);
 }  // namespace llvm_ir
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
index e8b0605b9d75677b34f0973d88d269a5795b7629..36f5fa195224c20e30a14f72b32eb42a681bb5e9 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -33,26 +33,24 @@ namespace xla {
 namespace llvm_ir {
 
 LoopEmitter::LoopEmitter(const BodyEmitter& body_emitter, const Shape& shape,
-                         llvm::IRBuilder<>* ir_builder)
-    : body_emitter_(body_emitter), shape_(shape), ir_builder_(ir_builder) {}
+                         llvm::IRBuilder<>* b)
+    : body_emitter_(body_emitter), shape_(shape), b_(b) {}
 
 LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
-                         const IrArray& target_array,
-                         llvm::IRBuilder<>* ir_builder)
+                         const IrArray& target_array, llvm::IRBuilder<>* b)
     : body_emitter_([=](const llvm_ir::IrArray::Index array_index) -> Status {
         // Convert target_element_generator to a BodyEmitter.
         TF_ASSIGN_OR_RETURN(llvm::Value * target_element,
                             target_element_generator(array_index));
-        target_array.EmitWriteArrayElement(array_index, target_element,
-                                           ir_builder);
+        target_array.EmitWriteArrayElement(array_index, target_element, b);
         return Status::OK();
       }),
       shape_(target_array.GetShape()),
-      ir_builder_(ir_builder) {}
+      b_(b) {}
 
 static LoopEmitter::BodyEmitter MakeBodyEmitterForMultiOutputFusion(
     const ElementGenerator& target_element_generator,
-    const std::vector<IrArray>& target_arrays, llvm::IRBuilder<>* ir_builder) {
+    const std::vector<IrArray>& target_arrays, llvm::IRBuilder<>* b) {
   return [=](const llvm_ir::IrArray::Index array_index) {
     TF_ASSIGN_OR_RETURN(llvm::Value * target_element,
                         target_element_generator(array_index));
@@ -64,8 +62,7 @@ static LoopEmitter::BodyEmitter MakeBodyEmitterForMultiOutputFusion(
 
     for (int64 i = 0; i < target_arrays.size(); ++i) {
       target_arrays[i].EmitWriteArrayElement(
-          array_index, ir_builder->CreateExtractValue(target_element, i),
-          ir_builder);
+          array_index, b->CreateExtractValue(target_element, i), b);
     }
     return Status::OK();
   };
@@ -73,13 +70,12 @@ static LoopEmitter::BodyEmitter MakeBodyEmitterForMultiOutputFusion(
 
 LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
                          tensorflow::gtl::ArraySlice<IrArray> target_arrays,
-                         llvm::IRBuilder<>* ir_builder)
+                         llvm::IRBuilder<>* b)
     : body_emitter_(MakeBodyEmitterForMultiOutputFusion(
           target_element_generator,
-          std::vector<IrArray>(target_arrays.begin(), target_arrays.end()),
-          ir_builder)),
+          std::vector<IrArray>(target_arrays.begin(), target_arrays.end()), b)),
       shape_(target_arrays[0].GetShape()),
-      ir_builder_(ir_builder) {
+      b_(b) {
   // Sanity check: In multi-output fusion, all shapes produced must have the
   // same dimensions.
   for (const IrArray& array : target_arrays) {
@@ -102,7 +98,7 @@ std::vector<IrArray::Index> LoopEmitter::EmitIndexAndSetExitBasicBlock(
   // Loops are added from outermost to innermost order with the ForLoopNest
   // class so emit loops in order from most-major dimension down to most-minor
   // dimension (of the target shape).
-  ForLoopNest loop_nest(loop_name, ir_builder_);
+  ForLoopNest loop_nest(loop_name, b_);
   IrArray::Index array_index(index_type, shape_.dimensions_size());
   for (int i = 0; i < LayoutUtil::MinorToMajor(shape_).size(); ++i) {
     int64 dimension = LayoutUtil::Major(shape_.layout(), i);
@@ -116,8 +112,8 @@ std::vector<IrArray::Index> LoopEmitter::EmitIndexAndSetExitBasicBlock(
   // Set IR builder insertion point to the loop body basic block of the
   // innermost loop.
   llvm::BasicBlock* innermost_body_bb = loop_nest.GetInnerLoopBodyBasicBlock();
-  ir_builder_->SetInsertPoint(innermost_body_bb,
-                              innermost_body_bb->getFirstInsertionPt());
+  b_->SetInsertPoint(innermost_body_bb,
+                     innermost_body_bb->getFirstInsertionPt());
 
   // Set exit_bb_ to the exit block of the loop nest.
   exit_bb_ = loop_nest.GetOuterLoopExitBasicBlock();
@@ -129,7 +125,7 @@ std::vector<IrArray::Index> LoopEmitter::EmitIndexAndSetExitBasicBlock(
 Status LoopEmitter::EmitLoop(tensorflow::StringPiece loop_name,
                              llvm::Type* index_type) {
   if (index_type == nullptr) {
-    index_type = ir_builder_->getInt64Ty();
+    index_type = b_->getInt64Ty();
   }
 
   for (const IrArray::Index& array_index :
@@ -137,10 +133,10 @@ Status LoopEmitter::EmitLoop(tensorflow::StringPiece loop_name,
     TF_RETURN_IF_ERROR(body_emitter_(array_index));
   }
 
-  // Set the insertion point of ir_builder_ to the loop exit, so that
+  // Set the insertion point of b_ to the loop exit, so that
   // code emitted for later instructions will be correctly placed.
   if (exit_bb_ != nullptr) {
-    ir_builder_->SetInsertPoint(exit_bb_);
+    b_->SetInsertPoint(exit_bb_);
   }
   return Status::OK();
 }
diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
index 6be1c2fba2cbd78a02865901ef8c5b7e2b2a74e6..c4f5c82086ccfa233e0be118b1de10cce55a51b1 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h
@@ -41,11 +41,11 @@ class LoopEmitter {
   using BodyEmitter = std::function<Status(const IrArray::Index& index)>;
 
   LoopEmitter(const BodyEmitter& body_emitter, const Shape& shape,
-              llvm::IRBuilder<>* ir_builder);
+              llvm::IRBuilder<>* b);
   // Constructs a LoopEmitter from an element generator that generates each
   // element of the given target array.
   LoopEmitter(const ElementGenerator& target_element_generator,
-              const IrArray& target_array, llvm::IRBuilder<>* ir_builder);
+              const IrArray& target_array, llvm::IRBuilder<>* b);
 
   // Constructs a LoopEmitter that emits one element into each of N separate
   // arrays on each iteration of the loop.
@@ -54,7 +54,7 @@ class LoopEmitter {
   // produce an LLVM struct with N elements.
   LoopEmitter(const ElementGenerator& target_element_generator,
               tensorflow::gtl::ArraySlice<IrArray> target_arrays,
-              llvm::IRBuilder<>* ir_builder);
+              llvm::IRBuilder<>* b);
 
   LoopEmitter(const LoopEmitter&) = delete;
   LoopEmitter& operator=(const LoopEmitter&) = delete;
@@ -65,8 +65,7 @@ class LoopEmitter {
   // specifies the element, will return multiple indices if the loop is
   // unrolled.
   std::vector<IrArray::Index> EmitIndexAndSetExitBasicBlock() {
-    return EmitIndexAndSetExitBasicBlock(/*loop_name=*/"",
-                                         ir_builder_->getInt64Ty());
+    return EmitIndexAndSetExitBasicBlock(/*loop_name=*/"", b_->getInt64Ty());
   }
 
   virtual std::vector<IrArray::Index> EmitIndexAndSetExitBasicBlock(
@@ -87,7 +86,7 @@ class LoopEmitter {
   // scalar, no loops are emitted and exit_bb_ is nullptr in that case.
   llvm::BasicBlock* exit_bb_;
 
-  llvm::IRBuilder<>* ir_builder_;
+  llvm::IRBuilder<>* b_;
 };
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc b/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e115cdabf4b290617700276dba8f2e5648a7c07
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/math_ops.cc
@@ -0,0 +1,59 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/llvm_ir/math_ops.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+
+namespace xla {
+namespace llvm_ir {
+
+llvm::Value* EmitFastTanh(llvm::IRBuilder<>* b, llvm::Value* input) {
+  llvm::Type* type = input->getType();
+
+  // Clamp the input to [-9, 9].
+  llvm::Value* input_clamped = llvm_ir::EmitFloatMin(
+      llvm_ir::EmitFloatMax(input, llvm::ConstantFP::get(type, -9.0), b),
+      llvm::ConstantFP::get(type, 9.0), b);
+
+  static constexpr std::array<float, 7> numerator_coeffs{
+      -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
+      5.12229709037114e-08f,  1.48572235717979e-05f, 6.37261928875436e-04f,
+      4.89352455891786e-03f};
+
+  static constexpr std::array<float, 4> denominator_coeffs{
+      1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
+      4.89352518554385e-03f};
+
+  llvm::Value* input_squared = b->CreateFMul(input_clamped, input_clamped);
+  llvm::Value* numerator = llvm::ConstantFP::get(type, numerator_coeffs[0]);
+  for (int i = 1; i < numerator_coeffs.size(); i++) {
+    numerator = b->CreateFAdd(b->CreateFMul(input_squared, numerator),
+                              llvm::ConstantFP::get(type, numerator_coeffs[i]));
+  }
+
+  numerator = b->CreateFMul(input_clamped, numerator);
+
+  llvm::Value* denominator = llvm::ConstantFP::get(type, denominator_coeffs[0]);
+  for (int i = 1; i < denominator_coeffs.size(); i++) {
+    denominator =
+        b->CreateFAdd(b->CreateFMul(input_squared, denominator),
+                      llvm::ConstantFP::get(type, denominator_coeffs[i]));
+  }
+
+  return b->CreateFDiv(numerator, denominator);
+}
+
+}  // namespace llvm_ir
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/pool_test.cc b/tensorflow/compiler/xla/service/llvm_ir/math_ops.h
similarity index 54%
rename from tensorflow/compiler/xla/service/pool_test.cc
rename to tensorflow/compiler/xla/service/llvm_ir/math_ops.h
index 8c4fe258e38fff1b2086d8809bfc487e11ef713f..6c8bc3a076367eae2f1829966be2872e5f258178 100644
--- a/tensorflow/compiler/xla/service/pool_test.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/math_ops.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,28 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/service/pool.h"
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_MATH_OPS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_MATH_OPS_H_
 
-#include "tensorflow/compiler/xla/test_helpers.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Value.h"
 
 namespace xla {
-namespace {
+namespace llvm_ir {
 
-using PoolTest = ::testing::Test;
+// Emits an approximation of tanh. The implementation uses the same rational
+// interpolant as implemented in Eigen3.
+llvm::Value* EmitFastTanh(llvm::IRBuilder<>* b, llvm::Value* input);
 
-TEST_F(PoolTest, Test) {
-  Pool<int> pool;
-
-  {
-    auto ptr = pool.Allocate();
-    EXPECT_NE(nullptr, ptr.get());
-    *ptr = 5;
-  }
-
-  auto ptr = pool.Allocate();
-  EXPECT_NE(nullptr, ptr.get());
-  EXPECT_EQ(5, *ptr);
-}
-
-}  // namespace
+}  // namespace llvm_ir
 }  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_MATH_OPS_H_
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5187948e294334b98d389607437bdf545923da0e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
@@ -0,0 +1,167 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
+
+// IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Value.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h"
+#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace llvm_ir {
+
+namespace {
+// Adds the inner comparison loop where we compare elements pointed to by
+// 'keys_index' and 'compare_keys_index'.
+void EmitCompareLoop(int64 dimension_to_sort, const IrArray::Index& keys_index,
+                     const IrArray::Index& compare_keys_index,
+                     const IrArray& keys_array,
+                     const tensorflow::gtl::optional<IrArray>& values_array,
+                     llvm::IRBuilder<>* b) {
+  // if (is_smaller_index &&
+  //     compare_keys[dimension_to_sort] < dimension_to_sort_bound)
+  llvm::Value* is_smaller_index = b->CreateICmpSLT(
+      keys_index[dimension_to_sort], compare_keys_index[dimension_to_sort]);
+  int64 dimension_to_sort_bound =
+      keys_array.GetShape().dimensions(dimension_to_sort);
+  auto if_data = EmitIfThenElse(
+      b->CreateAnd(is_smaller_index,
+                   b->CreateICmpSLT(compare_keys_index[dimension_to_sort],
+                                    keys_index.GetConstantWithIndexType(
+                                        dimension_to_sort_bound))),
+      "smaller_comparison_index", b, /*emit_else=*/false);
+  SetToFirstInsertPoint(if_data.true_block, b);
+  auto key1 = keys_array.EmitReadArrayElement(keys_index, b);
+  auto key2 = keys_array.EmitReadArrayElement(compare_keys_index, b);
+  auto key_type = keys_array.GetShape().element_type();
+  auto comparison =
+      primitive_util::IsFloatingPointType(key_type)
+          // TODO(b/26783907): Figure out how to handle NaNs.
+          ? b->CreateFCmp(llvm::FCmpInst::FCMP_ULT, key2, key1)
+          : b->CreateICmp(primitive_util::IsSignedIntegralType(key_type)
+                              ? llvm::ICmpInst::ICMP_SLT
+                              : llvm::ICmpInst::ICMP_ULT,
+                          key2, key1);
+  // If key2 < key1
+  auto if_smaller_data =
+      EmitIfThenElse(comparison, "is_smaller_than", b, /*emit_else=*/false);
+  SetToFirstInsertPoint(if_smaller_data.true_block, b);
+  // Swap key1 with key2.
+  keys_array.EmitWriteArrayElement(keys_index, key2, b);
+  keys_array.EmitWriteArrayElement(compare_keys_index, key1, b);
+  if (values_array.has_value()) {
+    // Also swap the values.
+    auto value1 = values_array.value().EmitReadArrayElement(keys_index, b);
+    auto value2 =
+        values_array.value().EmitReadArrayElement(compare_keys_index, b);
+    values_array.value().EmitWriteArrayElement(keys_index, value2, b);
+    values_array.value().EmitWriteArrayElement(compare_keys_index, value1, b);
+  }
+}
+}  // namespace
+
+Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
+                       const tensorflow::gtl::optional<IrArray>& values_array,
+                       tensorflow::StringPiece name, llvm::Value* xor_mask,
+                       llvm::IRBuilder<>* b,
+                       const gpu::LaunchDimensions* launch_dimensions) {
+  const Shape& keys_shape = keys_array.GetShape();
+
+  // TODO(b/26783907): This case can probably be avoided with the Algebraic
+  // Simplifier.
+  if (ShapeUtil::IsScalar(keys_shape)) {
+    return Status::OK();
+  }
+
+  // Create loop nests which loop through the operand dimensions. The sort
+  // dimension is handled in the innermost loop which performs the sorting.
+  ForLoopNest loop_nest(name, b);
+  IrArray::Index keys_index =
+      loop_nest.EmitOperandArrayLoopNest(keys_array, dimension_to_sort, "keys");
+  if (loop_nest.GetInnerLoopBodyBasicBlock() != nullptr) {
+    SetToFirstInsertPoint(loop_nest.GetInnerLoopBodyBasicBlock(), b);
+  }
+
+  // 'compare_keys_index' is the index of the element that 'keys_index' should
+  // be compared to.
+  IrArray::Index compare_keys_index(keys_index.GetType());
+  for (size_t dimension = 0; dimension < keys_index.size(); ++dimension) {
+    if (dimension != dimension_to_sort) {
+      compare_keys_index.push_back(keys_index[dimension]);
+    } else {
+      compare_keys_index.push_back(nullptr);
+    }
+  }
+
+  // Naive C++ code for the inner compare loop:
+  //
+  // for (int64 i = 0; i < dimension_to_sort_bound; ++i) {
+  //   int64 j = i ^ xor_mask;
+  //   if (i < j && j < dimension_to_sort_bound) {
+  //     int64 min_key = std::min(keys[i], keys[j]);
+  //     keys[j] = std::max(keys[i], keys[j]);
+  //     keys[i] = min_key;
+  //   }
+  // }
+  //
+  // This follows the algorithm described on Wikipedia:
+  // https://en.wikipedia.org/wiki/Bitonic_sorter
+
+  int64 dimension_to_sort_bound =
+      keys_array.GetShape().dimensions(dimension_to_sort);
+  Shape compare_shape = ShapeUtil::MakeShape(keys_shape.element_type(),
+                                             {dimension_to_sort_bound});
+  auto compare_loop_body_emitter =
+      [&](const IrArray::Index& compare_index) -> Status {
+    keys_index[dimension_to_sort] = compare_index[0];
+    compare_keys_index[dimension_to_sort] =
+        b->CreateXor(compare_index[0], xor_mask);
+    EmitCompareLoop(dimension_to_sort, keys_index, compare_keys_index,
+                    keys_array, values_array, b);
+    return Status::OK();
+  };
+  if (launch_dimensions != nullptr) {
+    TF_RETURN_IF_ERROR(gpu::ParallelLoopEmitter(compare_loop_body_emitter,
+                                                compare_shape,
+                                                *launch_dimensions, b)
+                           .EmitLoop(name));
+  } else {
+    TF_RETURN_IF_ERROR(LoopEmitter(compare_loop_body_emitter, compare_shape, b)
+                           .EmitLoop(name));
+  }
+
+  // Set the IR builder insert point to the exit basic block of the outer most
+  // loop. This ensures later instructions are inserted after this loop nest.
+  b->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock());
+
+  return Status::OK();
+}
+
+}  // namespace llvm_ir
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..8458744c6bc0e50a1c1cc8d3e66e29c7d4f74d73
--- /dev/null
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
@@ -0,0 +1,41 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_
+
+#include "llvm/IR/Value.h"
+#include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/optional.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace llvm_ir {
+// Emits llvm IR to do pairwise comparisons/swaps in the 'dimension_to_sort'
+// dimension of 'keys_array'. All other dimensions are kept as-is. This
+// implements the inner loop of BitonicSort. If 'launch_dimensions' is nullptr,
+// the inner compare loop will not be parallelized.
+Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
+                       const tensorflow::gtl::optional<IrArray>& values_array,
+                       tensorflow::StringPiece name, llvm::Value* xor_mask,
+                       llvm::IRBuilder<>* b,
+                       const gpu::LaunchDimensions* launch_dimensions);
+}  // namespace llvm_ir
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_LLVM_IR_SORT_UTIL_H_
diff --git a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
index 5fc08aab916e377b245b6221108956c06da70767..11ed6ee59f1bf8e7004b8bef7319b37ef41a304c 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
@@ -31,12 +31,12 @@ namespace llvm_ir {
 
 void EmitTupleSelect(const IrArray& select, const IrArray& pred,
                      llvm::Value* on_true, llvm::Value* on_false,
-                     llvm::IRBuilder<>* ir_builder, llvm::Module* module) {
+                     llvm::IRBuilder<>* b, llvm::Module* module) {
   CHECK(ShapeUtil::IsScalar(pred.GetShape()));
 
   llvm::LoadInst* pred_value =
-      ir_builder->CreateLoad(pred.GetBasePointer(), "load_predicate_value");
-  llvm::Value* pred_cond = ir_builder->CreateICmpNE(
+      b->CreateLoad(pred.GetBasePointer(), "load_predicate_value");
+  llvm::Value* pred_cond = b->CreateICmpNE(
       pred_value,
       llvm::ConstantInt::get(PrimitiveTypeToIrType(PRED, module), 0),
       "boolean_predicate");
@@ -46,47 +46,42 @@ void EmitTupleSelect(const IrArray& select, const IrArray& pred,
   VLOG(2) << "  pred_cond: " << DumpToString(*pred_cond);
 
   for (int i = 0; i < ShapeUtil::TupleElementCount(select.GetShape()); ++i) {
-    llvm::Value* const element_index[] = {ir_builder->getInt64(0),
-                                          ir_builder->getInt64(i)};
+    llvm::Value* const element_index[] = {b->getInt64(0), b->getInt64(i)};
     llvm::Value* on_true_element_address =
-        ir_builder->CreateInBoundsGEP(on_true, element_index);
-    llvm::Value* on_true_element = ir_builder->CreateLoad(
+        b->CreateInBoundsGEP(on_true, element_index);
+    llvm::Value* on_true_element = b->CreateLoad(
         on_true_element_address, "on_true_element_" + llvm::Twine(i));
     llvm::Value* on_false_element_address =
-        ir_builder->CreateInBoundsGEP(on_false, element_index);
-    llvm::Value* on_false_element = ir_builder->CreateLoad(
+        b->CreateInBoundsGEP(on_false, element_index);
+    llvm::Value* on_false_element = b->CreateLoad(
         on_false_element_address, "on_false_element_" + llvm::Twine(i));
 
     llvm::Value* output_element_address =
-        ir_builder->CreateInBoundsGEP(select.GetBasePointer(), element_index);
-    ir_builder->CreateStore(
-        ir_builder->CreateSelect(pred_cond, on_true_element, on_false_element,
-                                 "select_output_element_" + llvm::Twine(i)),
-        output_element_address);
+        b->CreateInBoundsGEP(select.GetBasePointer(), element_index);
+    b->CreateStore(b->CreateSelect(pred_cond, on_true_element, on_false_element,
+                                   "select_output_element_" + llvm::Twine(i)),
+                   output_element_address);
   }
 }
 
 void EmitTuple(const IrArray& tuple,
                tensorflow::gtl::ArraySlice<llvm::Value*> operands,
-               llvm::IRBuilder<>* ir_builder, llvm::Module* module) {
+               llvm::IRBuilder<>* b, llvm::Module* module) {
   for (size_t i = 0; i < operands.size(); ++i) {
-    auto* store = ir_builder->CreateStore(
-        ir_builder->CreatePointerCast(operands[i],
-                                      PrimitiveTypeToIrType(TUPLE, module)),
-        ir_builder->CreateInBoundsGEP(
-            tuple.GetBasePointer(),
-            {ir_builder->getInt64(0), ir_builder->getInt64(i)}));
+    auto* store = b->CreateStore(
+        b->CreatePointerCast(operands[i], PrimitiveTypeToIrType(TUPLE, module)),
+        b->CreateInBoundsGEP(tuple.GetBasePointer(),
+                             {b->getInt64(0), b->getInt64(i)}));
     tuple.AnnotateLoadStoreInstructionWithMetadata(store);
   }
 }
 
 llvm::Value* EmitGetTupleElement(const Shape& target_shape, int64 index,
                                  int alignment, llvm::Value* operand,
-                                 llvm::IRBuilder<>* ir_builder,
-                                 llvm::Module* module) {
-  llvm::Value* element_ptr = ir_builder->CreateInBoundsGEP(
-      operand, {ir_builder->getInt64(0), ir_builder->getInt64(index)});
-  llvm::LoadInst* src_buffer = ir_builder->CreateLoad(element_ptr);
+                                 llvm::IRBuilder<>* b, llvm::Module* module) {
+  llvm::Value* element_ptr =
+      b->CreateInBoundsGEP(operand, {b->getInt64(0), b->getInt64(index)});
+  llvm::LoadInst* src_buffer = b->CreateLoad(element_ptr);
 
   // Mark the loaded pointer as dereferenceable if we know its shape.
   if (!ShapeUtil::IsOpaque(target_shape)) {
@@ -98,7 +93,7 @@ llvm::Value* EmitGetTupleElement(const Shape& target_shape, int64 index,
 
   llvm::Type* element_type = ShapeToIrType(target_shape, module);
   llvm::Value* ret_val =
-      ir_builder->CreateBitCast(src_buffer, element_type->getPointerTo());
+      b->CreateBitCast(src_buffer, element_type->getPointerTo());
   return ret_val;
 }
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h
index 352d34ebf839c6c2465abade7c3d3eb3b7a34506..cf6bf5d0b14ba71cbed67f9a1dc728c0eef5e393 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h
@@ -61,13 +61,13 @@ namespace llvm_ir {
 //   output[i] = pred ? tuple_on_true[i] : tuple_on_false[i]
 void EmitTupleSelect(const IrArray& select, const IrArray& pred,
                      llvm::Value* on_true, llvm::Value* on_false,
-                     llvm::IRBuilder<>* ir_builder, llvm::Module* module);
+                     llvm::IRBuilder<>* b, llvm::Module* module);
 
 // A tuple is an array of pointers, one for each operand. Each pointer points to
 // the output buffer of its corresponding operand.
 void EmitTuple(const IrArray& tuple,
                tensorflow::gtl::ArraySlice<llvm::Value*> operands,
-               llvm::IRBuilder<>* ir_builder, llvm::Module* module);
+               llvm::IRBuilder<>* b, llvm::Module* module);
 
 // A tuple is an array of pointers, one for each operand. Each pointer points to
 // the output buffer of its corresponding operand. A GetTupleElement instruction
@@ -75,8 +75,7 @@ void EmitTuple(const IrArray& tuple,
 // Returns an llvm value representing a pointer to the tuple element buffer.
 llvm::Value* EmitGetTupleElement(const Shape& target_shape, int64 index,
                                  int alignment, llvm::Value* operand,
-                                 llvm::IRBuilder<>* ir_builder,
-                                 llvm::Module* module);
+                                 llvm::IRBuilder<>* b, llvm::Module* module);
 }  // namespace llvm_ir
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 53efc30c3653879709fceae3dcdd4f679740f622..5e02096ee501b23a7976a50f13bb7e7f3c5e2d34 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/backend.h"
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index 39d6734c3fc06df6832cf67edddbc7c14c815cd1..8f707ea9046a00a15cac469672a7a992f20bf483 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
diff --git a/tensorflow/compiler/xla/service/pool.h b/tensorflow/compiler/xla/service/pool.h
deleted file mode 100644
index 8e710ebb6dc17e0e204ba6ab3c6c159627cd9d3b..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/service/pool.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_POOL_H_
-#define TENSORFLOW_COMPILER_XLA_POOL_H_
-
-#include <functional>
-#include <vector>
-
-#include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/core/platform/mutex.h"
-
-namespace xla {
-
-// Pool of values, which are created as needed and destroyed when the `Pool` is
-// destroyed
-template <typename T>
-class Pool {
- public:
-  struct Deleter {
-    void operator()(T* ptr) { pool->Deallocate(ptr); }
-
-    Pool<T>* pool;
-  };
-
-  // A pointer to a taken element of a `Pool` which returns it to the pool on
-  // destruction
-  using SmartPtr = std::unique_ptr<T, Deleter>;
-
-  // Constructs a `Pool` with given factory function, which need not be
-  // thread-safe.
-  explicit Pool(std::function<std::unique_ptr<T>()> factory)
-      : factory_(factory) {}
-
-  explicit Pool() : Pool([]() { return MakeUnique<T>(); }) {}
-
-  // Returns a pointer to a value in the pool, creating a new value if none is
-  // free. The returned smart pointer returns the element to the pool on
-  // destruction.
-  //
-  // This method is thread-safe.
-  SmartPtr Allocate() {
-    tensorflow::mutex_lock lock(mu_);
-    T* ptr;
-    if (!xs_.empty()) {
-      ptr = std::move(xs_.back()).release();
-      xs_.pop_back();
-    } else {
-      ptr = factory_().release();
-    }
-    Deleter del = {this};
-    return std::unique_ptr<T, Deleter>(ptr, del);
-  }
-
- private:
-  // Puts a pointer to a value back into the pool, leaving it free for future
-  // use.
-  //
-  // This method is thread-safe.
-  void Deallocate(T* ptr) {
-    tensorflow::mutex_lock lock(mu_);
-    xs_.push_back(std::unique_ptr<T>(ptr));
-  }
-
-  const std::function<std::unique_ptr<T>()> factory_ GUARDED_BY(mu_);
-  std::vector<std::unique_ptr<T>> xs_ GUARDED_BY(mu_);
-  tensorflow::mutex mu_;
-};
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_POOL_H_
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 636013cbb561f8506e173bd634e07b48a8dc570e..ce070bc5b6c3dfc22ffd0922be27f0afd6bff48f 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_proto_util.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/service/source_map_util.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -376,7 +377,7 @@ Service::ExecuteParallelAndRegisterResult(
     ExecutionProfile* profile) {
   // Streams where the computation are launched, so we can wait on the streams
   // to complete.
-  std::vector<Pool<se::Stream>::SmartPtr> streams;
+  std::vector<StreamPool::Ptr> streams;
   std::vector<std::unique_ptr<se::Timer>> timers;
 
   // Global data handles for the computation results, one for each computation.
@@ -403,7 +404,7 @@ Service::ExecuteParallelAndRegisterResult(
     CHECK_EQ(replicas.size(), arguments[i].size());
     std::vector<ScopedShapedBuffer> result_buffers;
     for (int64 replica = 0; replica < replicas.size(); ++replica) {
-      TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
+      TF_ASSIGN_OR_RETURN(StreamPool::Ptr stream,
                           backend->BorrowStream(replicas[replica]));
       streams.push_back(std::move(stream));
 
@@ -515,13 +516,13 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
         arguments,
     Backend* backend, const string& result_tag, ExecutionProfile* profile) {
   // Set up streams.
-  std::vector<Pool<se::Stream>::SmartPtr> streams;
+  std::vector<StreamPool::Ptr> streams;
 
   TF_ASSIGN_OR_RETURN(auto replicas,
                       Replicas(*backend, SingleComputationDeviceHandle()));
   TF_RET_CHECK(!replicas.empty());
   for (se::StreamExecutor* executor : replicas) {
-    TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
+    TF_ASSIGN_OR_RETURN(StreamPool::Ptr stream,
                         backend->BorrowStream(executor));
     streams.push_back(std::move(stream));
   }
@@ -533,7 +534,7 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
 
   // Set up run options.
   std::vector<ServiceExecutableRunOptions> run_options;
-  for (const Pool<se::Stream>::SmartPtr& stream : streams) {
+  for (const StreamPool::Ptr& stream : streams) {
     ExecutableRunOptions options;
     options.set_stream(stream.get());
     options.set_device_ordinal(stream->parent()->device_ordinal());
diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h
index 7f3910cdb0366078b97fb5f6a2dc498b37570926..dbfed628bfcabffe66bef41a82e0e2430897d80d 100644
--- a/tensorflow/compiler/xla/service/service_executable_run_options.h
+++ b/tensorflow/compiler/xla/service/service_executable_run_options.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_SERVICE_EXECUTABLE_RUN_OPTIONS_H_
 
 #include "tensorflow/compiler/xla/executable_run_options.h"
-#include "tensorflow/compiler/xla/service/pool.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
@@ -27,8 +27,7 @@ namespace xla {
 // data, now only a stream cache for GPU backend.
 class ServiceExecutableRunOptions {
  public:
-  using StreamBorrower =
-      std::function<StatusOr<Pool<se::Stream>::SmartPtr>(int)>;
+  using StreamBorrower = std::function<StatusOr<StreamPool::Ptr>(int)>;
 
   ServiceExecutableRunOptions()
       : ServiceExecutableRunOptions(ExecutableRunOptions()) {}
@@ -51,7 +50,7 @@ class ServiceExecutableRunOptions {
 
   // Borrows a stream and returns a smart pointer which returns the stream on
   // destruction.
-  StatusOr<Pool<se::Stream>::SmartPtr> BorrowStream(int device_ordinal) const {
+  StatusOr<StreamPool::Ptr> BorrowStream(int device_ordinal) const {
     return borrow_stream_
                ? borrow_stream_(device_ordinal)
                : Status(tensorflow::error::UNIMPLEMENTED, "No stream cache");
diff --git a/tensorflow/compiler/xla/service/stream_pool.cc b/tensorflow/compiler/xla/service/stream_pool.cc
new file mode 100644
index 0000000000000000000000000000000000000000..92bb21b816c36df4dee266942a7ce51718efdfd1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/stream_pool.cc
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/stream_pool.h"
+
+#include "tensorflow/compiler/xla/ptr_util.h"
+
+namespace xla {
+
+StreamPool::Ptr StreamPool::BorrowStream(se::StreamExecutor* executor) {
+  std::unique_ptr<se::Stream> stream;
+  {
+    tensorflow::mutex_lock lock(mu_);
+    if (!streams_.empty()) {
+      // Re-use an existing stream from the pool.
+      stream = std::move(streams_.back());
+      streams_.pop_back();
+    }
+  }
+
+  if (!stream) {
+    // Create a new stream.
+    stream = MakeUnique<se::Stream>(executor);
+    stream->Init();
+  }
+
+  // Return the stream wrapped in Ptr, which has our special deleter semantics.
+  PtrDeleter deleter = {this};
+  return Ptr(stream.release(), deleter);
+}
+
+void StreamPool::ReturnStream(se::Stream* stream) {
+  if (stream->ok()) {
+    tensorflow::mutex_lock lock(mu_);
+    streams_.emplace_back(stream);
+  } else {
+    // If the stream has encountered any errors, all subsequent
+    // operations on it will fail. So just delete the stream, and rely
+    // on new streams to be created in the future.
+    delete stream;
+  }
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/stream_pool.h b/tensorflow/compiler/xla/service/stream_pool.h
new file mode 100644
index 0000000000000000000000000000000000000000..7221d323a61593ac4b203a81b6046d81a5beaaf0
--- /dev/null
+++ b/tensorflow/compiler/xla/service/stream_pool.h
@@ -0,0 +1,64 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_STREAM_POOL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_STREAM_POOL_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+
+// Pool of stream_executor::Streams, which are created as needed and
+// destroyed when the pool is destroyed.
+class StreamPool {
+ public:
+  struct PtrDeleter {
+    void operator()(se::Stream* stream) { pool->ReturnStream(stream); }
+    StreamPool* pool;
+  };
+
+  // Stream pointer type returned by BorrowStream, which returns the
+  // stream to the pool on destruction.
+  using Ptr = std::unique_ptr<se::Stream, PtrDeleter>;
+
+  StreamPool() {}
+
+  // Returns a pointer to a stream in the pool, creating a new stream
+  // if none are available in the pool. The returned smart pointer
+  // returns the stream to the pool on destruction.
+  //
+  // This method is thread-safe.
+  Ptr BorrowStream(se::StreamExecutor* executor);
+
+ private:
+  // Puts a pointer to a stream back into the pool, leaving it free
+  // for future use. Streams that have previously encountered errors
+  // are deleted, and not returned to the pool.
+  //
+  // This method is thread-safe.
+  void ReturnStream(se::Stream* stream);
+
+  tensorflow::mutex mu_;
+  std::vector<std::unique_ptr<se::Stream>> streams_ GUARDED_BY(mu_);
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_STREAM_POOL_H_
diff --git a/tensorflow/compiler/xla/service/stream_pool_test.cc b/tensorflow/compiler/xla/service/stream_pool_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aaf5c37b0d250f78cb57639255ac9b59e1b462f7
--- /dev/null
+++ b/tensorflow/compiler/xla/service/stream_pool_test.cc
@@ -0,0 +1,136 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/stream_pool.h"
+
+#include <memory>
+
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/core/platform/stream_executor_no_cuda.h"
+
+namespace xla {
+namespace {
+
+class StreamPoolTest : public ::testing::Test {
+ protected:
+  std::unique_ptr<se::StreamExecutor> NewStreamExecutor() {
+    se::Platform* platform =
+        se::MultiPlatformManager::PlatformWithName("Host").ConsumeValueOrDie();
+    se::StreamExecutorConfig config(/*ordinal=*/0);
+    return platform->GetUncachedExecutor(config).ConsumeValueOrDie();
+  }
+};
+
+TEST_F(StreamPoolTest, EmptyPool) { StreamPool pool; }
+
+TEST_F(StreamPoolTest, OneStreamPool) {
+  std::unique_ptr<se::StreamExecutor> executor = NewStreamExecutor();
+  StreamPool pool;
+
+  // Borrow and return a stream.
+  StreamPool::Ptr stream1 = pool.BorrowStream(executor.get());
+  se::Stream* stream1_ptr = stream1.get();
+  EXPECT_TRUE(stream1->ok());
+  stream1 = nullptr;
+
+  // Borrow and return another stream.
+  StreamPool::Ptr stream2 = pool.BorrowStream(executor.get());
+  se::Stream* stream2_ptr = stream2.get();
+  EXPECT_TRUE(stream2->ok());
+  stream2 = nullptr;
+
+  // The underlying streams should be the same, since stream1 was the
+  // only stream available in the pool when stream2 was borrowed.
+  EXPECT_EQ(stream1_ptr, stream2_ptr);
+}
+
+TEST_F(StreamPoolTest, TwoStreamPool) {
+  std::unique_ptr<se::StreamExecutor> executor = NewStreamExecutor();
+  StreamPool pool;
+
+  // Borrow two streams.
+  StreamPool::Ptr stream1 = pool.BorrowStream(executor.get());
+  se::Stream* stream1_ptr = stream1.get();
+  EXPECT_TRUE(stream1->ok());
+  StreamPool::Ptr stream2 = pool.BorrowStream(executor.get());
+  se::Stream* stream2_ptr = stream2.get();
+  EXPECT_TRUE(stream2->ok());
+
+  // The underlying streams should be different, since we haven't
+  // returned either of them yet.
+  EXPECT_NE(stream1_ptr, stream2_ptr);
+
+  // Return stream1 and borrow stream3.
+  stream1 = nullptr;
+  StreamPool::Ptr stream3 = pool.BorrowStream(executor.get());
+  se::Stream* stream3_ptr = stream3.get();
+  EXPECT_TRUE(stream3->ok());
+
+  // stream1 and stream3 should be the same.
+  EXPECT_EQ(stream1_ptr, stream3_ptr);
+  EXPECT_NE(stream2_ptr, stream3_ptr);
+
+  // Return stream2, and borrow stream4.
+  stream2 = nullptr;
+  StreamPool::Ptr stream4 = pool.BorrowStream(executor.get());
+  se::Stream* stream4_ptr = stream4.get();
+  EXPECT_TRUE(stream4->ok());
+
+  // Stream2 and stream4 should be the same.
+  EXPECT_EQ(stream2_ptr, stream4_ptr);
+  EXPECT_NE(stream3_ptr, stream4_ptr);
+}
+
+TEST_F(StreamPoolTest, BadStreamDiscarded) {
+  std::unique_ptr<se::StreamExecutor> executor = NewStreamExecutor();
+  StreamPool pool;
+
+  // Borrow a stream.
+  StreamPool::Ptr stream1 = pool.BorrowStream(executor.get());
+  EXPECT_TRUE(stream1->ok());
+
+  // Force an error on the stream; here we call a method that requires
+  // DNN support, which we know the Host platform doesn't support.
+  stream1->ThenDepthConcatenate({}, {}, nullptr);
+  EXPECT_FALSE(stream1->ok());
+
+  // Return stream1 and borrow stream2.
+  stream1 = nullptr;
+  StreamPool::Ptr stream2 = pool.BorrowStream(executor.get());
+  se::Stream* stream2_ptr = stream2.get();
+  EXPECT_TRUE(stream2->ok());
+
+  // The underlying streams should be different. They would have been
+  // the same, but since we forced an error on stream1, it cannot be
+  // put back into the pool. Sadly we can't just check:
+  //    EXPECT_NE(stream1_ptr, stream2_ptr);
+  //
+  // The above should hold logically, but it may fail if the new
+  // stream instance allocated for stream2 happens to reside in the
+  // same memory address as stream1, which has been deleted.
+  //
+  // The check that stream2->ok() serves as a good-enough check.
+
+  // Return stream2 and borrow stream3. The previous error on stream1
+  // has no effect on these streams, and they are the same.
+  stream2 = nullptr;
+  StreamPool::Ptr stream3 = pool.BorrowStream(executor.get());
+  se::Stream* stream3_ptr = stream3.get();
+  EXPECT_TRUE(stream3->ok());
+  EXPECT_EQ(stream2_ptr, stream3_ptr);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index 7051a4cf51749d294478cf9a34d4700cb52ae312..58f767e913fbc0023e0c45a4f0e82ecefeeef2d6 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc
index 10fc4958fae06414dbe7a3a0a798cb5c6e0f35c2..62af45128ad2fb7bf886bef78ec3ab42529a181e 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc
@@ -61,6 +61,12 @@ StatusOr<bool> WhileLoopConstantSinking::TrySinkingConstantsIntoWhileBody(
        WhileUtil::GetInvariantGTEsForWhileBody(*while_body)) {
     int64 index = invariant_gte->tuple_index();
     const HloInstruction& invariant_value = *init_value.operand(index);
+
+    // Should have at least one user that's not while_body_root.
+    if (invariant_gte->user_count() <= 1) {
+      continue;
+    }
+
     if (invariant_value.opcode() == HloOpcode::kConstant) {
       auto* constant_instr =
           while_body->AddInstruction(invariant_value.Clone(/*suffix=*/".sunk"));
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
index 393e75803888d8a642881c4d525b170d1e1180ba..266039d2ff8ef4befba0d1023ac1914737207d4f 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
@@ -196,5 +196,50 @@ ENTRY entry {
                         op::GetTupleElement(op::Parameter(0)),
                         op::GetTupleElement(op::Parameter(0))));
 }
+
+TEST_F(WhileLoopConstantSinkingTest, DontCreateDeadConstant) {
+  const char* const hlo_string = R"(
+HloModule ModuleWithWhile
+
+body {
+  p_body = (f32[2],f32[2]) parameter(0)
+  p_body.0 = f32[2] get-tuple-element((f32[2],f32[2]) p_body), index=0
+  p_body.1 = f32[2] get-tuple-element((f32[2],f32[2]) p_body), index=1
+
+  outfeed = token[] outfeed(p_body.0)
+  ROOT root = (f32[2],f32[2],f32[2]) tuple(p_body.0, p_body.1, p_body.1)
+}
+
+condition {
+  p_cond = (f32[2],f32[2]) parameter(0)
+  ROOT result = pred[] constant(true)
+}
+
+ENTRY entry {
+  const_0 = f32[2] constant({1, 2})
+  const_1 = f32[2] constant({2, 1})
+  while_init = (f32[2],f32[2]) tuple(const_0, const_1)
+  ROOT while = (f32[2],f32[2],f32[2]) while(while_init), condition=condition,
+                                      body=body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+
+  TF_ASSERT_OK_AND_ASSIGN(bool changed,
+                          WhileLoopConstantSinking{}.Run(module.get()));
+  ASSERT_TRUE(changed);
+
+  auto* while_body = module->GetComputationWithName("body");
+  EXPECT_THAT(while_body->root_instruction(),
+              op::Tuple(op::GetTupleElement(), op::GetTupleElement(),
+                        op::GetTupleElement()));
+  for (const HloInstruction* inst : while_body->instructions()) {
+    if (inst->opcode() == HloOpcode::kConstant) {
+      EXPECT_GT(inst->user_count(), 0);
+    }
+  }
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index f4668c0f559acd7b1301499aaf71d5c6925424b3..ec901af1e2057449452c4c65243593b016a26f61 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -682,7 +682,7 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
                            CompatibleIgnoringElementType);
   } else {
     // Opaque, token, etc types are vacuously compatible.
-    return true;
+    return lhs.element_type() == rhs.element_type();
   }
 }
 
@@ -697,7 +697,7 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
                            CompatibleIgnoringFpPrecision);
   } else {
     // Opaque, token, etc types are vacuously compatible.
-    return true;
+    return lhs.element_type() == rhs.element_type();
   }
 }
 
@@ -883,40 +883,51 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   }
 
   int64 shape_size = [&shape]() {
-    int64 shape_size;
     if (LayoutUtil::IsSparseArray(shape)) {
-      shape_size = LayoutUtil::MaxSparseElements(shape.layout());
-      if (shape_size < 0) {
-        return shape_size;
+      int64 max_sparse_elements = LayoutUtil::MaxSparseElements(shape.layout());
+      if (max_sparse_elements < 0) {
+        return max_sparse_elements;
+      }
+      int64 sparse_elements_size = MultiplyWithoutOverflow(
+          max_sparse_elements, ByteSizeOfPrimitiveType(shape.element_type()));
+      if (sparse_elements_size < 0) {
+        return sparse_elements_size;
       }
-      shape_size = MultiplyWithoutOverflow(shape_size, ShapeUtil::Rank(shape));
-      if (shape_size < 0) {
-        return shape_size;
+      int64 sparse_indices_size =
+          MultiplyWithoutOverflow(max_sparse_elements, ShapeUtil::Rank(shape));
+      if (sparse_indices_size < 0) {
+        return sparse_indices_size;
       }
-      shape_size = MultiplyWithoutOverflow(shape_size, sizeof(int64));
-      if (shape_size < 0) {
-        return shape_size;
+      sparse_indices_size =
+          MultiplyWithoutOverflow(sparse_indices_size, sizeof(int64));
+      if (sparse_indices_size < 0) {
+        return sparse_indices_size;
+      }
+      // At this point, both sparse_indices_size and sparse_elements_size are
+      // non-negative, so we can easily check if adding them wraps.
+      if (static_cast<uint64>(sparse_elements_size) +
+              static_cast<uint64>(sparse_indices_size) >
+          INT64_MAX) {
+        return static_cast<int64>(-1);
       }
     }
 
-    shape_size = 1;
-
     // This is intentionally unconditional: even if the shape is sparse, we want
     // to verify the densified version has a reasonable size.
+    int64 dense_shape_size = 1;
     if (shape.dimensions().empty()) {
-      return shape_size;
+      return dense_shape_size;
     }
 
     for (int64 dim : shape.dimensions()) {
-      shape_size = MultiplyWithoutOverflow(shape_size, dim);
-      if (shape_size < 0) {
-        return shape_size;
+      dense_shape_size = MultiplyWithoutOverflow(dense_shape_size, dim);
+      if (dense_shape_size < 0) {
+        return dense_shape_size;
       }
     }
-    shape_size = MultiplyWithoutOverflow(
-        shape_size, ByteSizeOfPrimitiveType(shape.element_type()));
-
-    return shape_size;
+    dense_shape_size = MultiplyWithoutOverflow(
+        dense_shape_size, ByteSizeOfPrimitiveType(shape.element_type()));
+    return dense_shape_size;
   }();
 
   if (shape_size < 0) {
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index ed2d16c0e90685d6cb2603eba0a4cf880046aa18..e5dd62ae9a3dd9b961a7ae03a99c19220dbd43e7 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -334,6 +334,17 @@ TEST(ShapeUtilTest, IncompatibleScalarVsTuple) {
   EXPECT_FALSE(ShapeUtil::CompatibleIgnoringFpPrecision(shape2, shape1));
 }
 
+TEST(ShapeUtilTest, OpaqueVsArray) {
+  Shape shape1 = ShapeUtil::MakeShape(F32, {5, 7});
+  Shape shape2 = ShapeUtil::MakeOpaqueShape();
+  EXPECT_FALSE(ShapeUtil::Compatible(shape1, shape2));
+  EXPECT_FALSE(ShapeUtil::Compatible(shape2, shape1));
+  EXPECT_FALSE(ShapeUtil::CompatibleIgnoringFpPrecision(shape1, shape2));
+  EXPECT_FALSE(ShapeUtil::CompatibleIgnoringFpPrecision(shape2, shape1));
+  EXPECT_FALSE(ShapeUtil::CompatibleIgnoringElementType(shape1, shape2));
+  EXPECT_FALSE(ShapeUtil::CompatibleIgnoringElementType(shape2, shape1));
+}
+
 TEST(ShapeUtilTest, CompareShapesWithPaddedDimensionsMismatch) {
   Shape shape1 = ShapeUtil::MakeShape(F32, {20, 30});
   shape1.mutable_layout()->add_padded_dimensions(10);
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 6a75aa6794e617e57e40b9a3cef74a9af74b91d3..42d52aee780e2aade0f2ed3597e653567b8da49b 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -154,8 +154,8 @@ tf_cc_binary(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:client_library",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/core:lib",
@@ -192,8 +192,8 @@ cc_library(
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:interpreter_plugin",  # reference backend
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -262,7 +262,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:computation_placer",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
@@ -290,8 +290,8 @@ xla_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -314,8 +314,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -334,8 +334,8 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -356,9 +356,9 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -376,9 +376,10 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:platform_util",
+        "//tensorflow/compiler/xla/service:stream_pool",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/core:lib",
@@ -395,8 +396,8 @@ xla_test(
     ],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -419,9 +420,9 @@ xla_test(
         "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -445,8 +446,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -464,9 +465,9 @@ xla_test(
     deps = [
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -483,8 +484,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -501,8 +502,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -519,9 +520,9 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -543,8 +544,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -562,8 +563,8 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -586,8 +587,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -612,8 +613,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -638,7 +639,7 @@ xla_test(
     deps = [
         ":client_library_test_base",
         ":literal_test_util",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
     ],
@@ -658,7 +659,7 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/service:reduce_precision_insertion",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -681,8 +682,8 @@ xla_test(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -702,7 +703,7 @@ xla_test(
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
@@ -725,8 +726,8 @@ xla_test(
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -749,8 +750,8 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -773,8 +774,8 @@ xla_test(
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -795,7 +796,7 @@ CONVOLUTION_TEST_DEPS = [
     "//tensorflow/compiler/xla/client:global_data",
     "//tensorflow/compiler/xla/client:local_client",
     "//tensorflow/compiler/xla/client:padding",
-    "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+    "//tensorflow/compiler/xla/client:xla_builder",
     "//tensorflow/compiler/xla/tests:client_library_test_base",
     "//tensorflow/compiler/xla/tests:literal_test_util",
     "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -838,8 +839,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -862,8 +863,8 @@ xla_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -891,10 +892,10 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:math",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -924,9 +925,9 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -950,8 +951,8 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -972,7 +973,7 @@ xla_test(
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -991,8 +992,8 @@ xla_test(
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array3d",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1013,7 +1014,7 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/service:computation_placer",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
@@ -1044,8 +1045,8 @@ xla_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1065,9 +1066,9 @@ xla_test(
         "//tensorflow/compiler/xla:array3d",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1096,9 +1097,9 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1123,9 +1124,9 @@ xla_test_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1164,9 +1165,9 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:padding",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1187,7 +1188,7 @@ xla_test(
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1241,8 +1242,8 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1260,7 +1261,7 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service/cpu:custom_call_target_registry",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
@@ -1283,8 +1284,8 @@ xla_test(
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla:reference_util",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1306,8 +1307,8 @@ xla_test(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1327,8 +1328,8 @@ xla_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1345,8 +1346,8 @@ xla_test(
     ],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1362,8 +1363,8 @@ xla_test(
     ],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1387,8 +1388,8 @@ xla_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -1408,8 +1409,8 @@ xla_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -1438,8 +1439,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1458,7 +1459,7 @@ xla_test(
         "//tensorflow/compiler/xla:array2d",
         "//tensorflow/compiler/xla:array4d",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1481,9 +1482,9 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1507,8 +1508,8 @@ xla_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1527,8 +1528,8 @@ xla_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1549,8 +1550,8 @@ xla_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -1572,7 +1573,7 @@ xla_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1593,8 +1594,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -1612,8 +1613,8 @@ xla_test(
     ],
     deps = [
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1635,8 +1636,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:global_data",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1656,8 +1657,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -1673,8 +1674,8 @@ xla_test(
     deps = [
         ":client_library_test_base",
         "//tensorflow/compiler/xla/client:global_data",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -1687,8 +1688,8 @@ xla_test(
     deps = [
         ":client_library_test_base",
         "//tensorflow/compiler/xla/client:global_data",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
@@ -1708,8 +1709,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1793,8 +1794,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service:hlo_runner",
@@ -1821,8 +1822,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_runner",
         "//tensorflow/compiler/xla/service:platform_util",
@@ -1858,8 +1859,8 @@ xla_test(
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/tests:literal_test_util",
@@ -1886,8 +1887,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
@@ -1903,6 +1904,16 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "outfeed_in_nested_computation_test",
+    srcs = ["outfeed_in_nested_computation_test.cc"],
+    deps = [
+        "//tensorflow/compiler/xla/tests:local_client_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+    ],
+)
+
 tf_cc_test(
     name = "hlo_metadata_test",
     srcs = [
@@ -1912,7 +1923,7 @@ tf_cc_test(
         ":local_client_test_base",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/service:cpu_plugin",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/core:test_main",
@@ -1954,8 +1965,8 @@ xla_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1968,7 +1979,7 @@ xla_test(
     name = "deep_graph_test",
     srcs = ["deep_graph_test.cc"],
     deps = [
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
@@ -2001,6 +2012,7 @@ xla_test(
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:generic_transfer_manager",
         "//tensorflow/compiler/xla/service:shaped_buffer",
+        "//tensorflow/compiler/xla/service:stream_pool",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
@@ -2053,10 +2065,30 @@ xla_test(
         ":local_client_test_base",
         ":test_utils",
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla/client/xla_client:xla_builder",
-        "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
 )
+
+xla_test(
+    name = "iota_test",
+    srcs = ["iota_test.cc"],
+    blacklisted_backends = [
+        "cpu",
+        "gpu",
+    ],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
+    deps = [
+        ":client_library_test_base",
+        ":literal_test_util",
+        ":xla_internal_test_main",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 3ae96fa1bcb1057653a75db62def5556ae37f886..74f2e36f826cd82ce4015df857f3de67950beaeb 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -24,7 +24,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tests/axpy_simple_test.cc b/tensorflow/compiler/xla/tests/axpy_simple_test.cc
index 8d15b7841bc7298cd6865d8689cc496c0459e4b9..caeb0bf49a0dde9eeac02037b2ea04fd024d100c 100644
--- a/tensorflow/compiler/xla/tests/axpy_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/axpy_simple_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
index 8c227df7f04e79ccc332062d0889d282c0f5e40f..af0b8522394a0c591e6c42ad12db8853ef66243c 100644
--- a/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
+++ b/tensorflow/compiler/xla/tests/bad_rng_shape_validation_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
diff --git a/tensorflow/compiler/xla/tests/batch_normalization_test.cc b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
index 6a024798f9e3faa5164b4dce6f43e517f6ab8eca..d372d1ca434b1da416f671060f9461cf07aa5fc4 100644
--- a/tensorflow/compiler/xla/tests/batch_normalization_test.cc
+++ b/tensorflow/compiler/xla/tests/batch_normalization_test.cc
@@ -22,8 +22,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc
index 747c82b502c8ec9f8121641382d9fd3c9552b010..6c20f654fe3df6a28e9633cd832c11b487894bad 100644
--- a/tensorflow/compiler/xla/tests/bfloat16_test.cc
+++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
diff --git a/tensorflow/compiler/xla/tests/binop_scaling_test.cc b/tensorflow/compiler/xla/tests/binop_scaling_test.cc
index 20cb989751ad69e2f3cf97c87c43293951f599ab..0d7a3aa46a9c12c19d954c11ae3a2cccbed886ef 100644
--- a/tensorflow/compiler/xla/tests/binop_scaling_test.cc
+++ b/tensorflow/compiler/xla/tests/binop_scaling_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
diff --git a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
index d531e8fa82e47f7bcd278f10da2c205e44db0ac1..c6b5108fe9e5bcf843982676d822f1942359da71 100644
--- a/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
+++ b/tensorflow/compiler/xla/tests/bitcast_convert_test.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
diff --git a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
index 50dd574624bb3874e682be5a272fb5bdefa4adc4..1d28e85b16596b0ec2717138fb2081878203e8b2 100644
--- a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tests/call_test.cc b/tensorflow/compiler/xla/tests/call_test.cc
index 2086e38b91955b23ab11af73acd7faf46ca4bb18..b1d18210eaafdfec0920c0cccaa0dfdbd6de5609 100644
--- a/tensorflow/compiler/xla/tests/call_test.cc
+++ b/tensorflow/compiler/xla/tests/call_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
index 0bc8facfe2cfcfab094f483137f6d8e241c6aaf9..a4eb57fc7b9abd460a7d158d0dc629eba88018cd 100644
--- a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
+++ b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index ef784da457be608de63eac478ef61c3df8627036..59d917054be2ebe3a25f902f51972a682a5231b6 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -19,7 +19,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
@@ -273,10 +273,16 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
     const Shape* shape_with_layout) {
   std::vector<GlobalData*> arguments(arguments_passed_in.begin(),
                                      arguments_passed_in.end());
+
+  // Transfer and use elements of arguments_, if the AddParam() API was used.
+  std::vector<std::unique_ptr<GlobalData>> owning_arguments;
   if (!arguments_.empty()) {
     CHECK(arguments.empty());
     for (const auto& argument : arguments_) {
-      arguments.push_back(argument.get());
+      owning_arguments.push_back(
+          client_->TransferToServer(MaybeConvertLiteralToBfloat16(argument))
+              .ValueOrDie());
+      arguments.push_back(owning_arguments.back().get());
     }
   }
 
@@ -331,10 +337,16 @@ Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus(
     ErrorSpec error, const Shape* shape_with_layout) {
   std::vector<GlobalData*> arguments(arguments_passed_in.begin(),
                                      arguments_passed_in.end());
+
+  // Transfer and use elements of arguments_, if the AddParam() API was used.
+  std::vector<std::unique_ptr<GlobalData>> owning_arguments;
   if (!arguments_.empty()) {
     CHECK(arguments.empty());
     for (const auto& argument : arguments_) {
-      arguments.push_back(argument.get());
+      owning_arguments.push_back(
+          client_->TransferToServer(MaybeConvertLiteralToBfloat16(argument))
+              .ValueOrDie());
+      arguments.push_back(owning_arguments.back().get());
     }
   }
 
@@ -454,6 +466,14 @@ ClientLibraryTestBase::ComputeValueAndReference(
   // function.
   std::vector<std::unique_ptr<GlobalData>> argument_data;
   std::vector<std::unique_ptr<GlobalData>> ref_argument_data;
+
+  // Use `arguments_` if the AddParam() API was used.  Otherwise, use
+  // plain `arguments`.
+  if (!arguments_.empty()) {
+    CHECK_EQ(arguments.size(), 0);
+    arguments = arguments_;
+  }
+
   for (const auto& arg : arguments) {
     TF_ASSIGN_OR_RETURN(auto data, client_->TransferToServer(arg.Clone()));
     TF_ASSIGN_OR_RETURN(auto ref_data, ref_client_->TransferToServer(arg));
@@ -552,10 +572,9 @@ ClientLibraryTestBase::CreatePatternedMatrixWithZeroPadding(int rows, int cols,
 
 XlaOp ClientLibraryTestBase::AddParam(const Literal& argument,
                                       XlaBuilder* builder) {
-  XlaOp data_handle;
-  arguments_.push_back(CreateParameterAndTransferLiteral(
-      arguments_.size(), argument, "", builder, &data_handle));
-  return data_handle;
+  arguments_.push_back(argument.Clone());
+  return Parameter(builder, /*parameter_number=*/arguments_.size() - 1,
+                   MaybeConvertShapeToBfloat16(argument.shape()), "");
 }
 
 XlaOp ClientLibraryTestBase::CreateConstantFromLiteral(const Literal& literal,
@@ -575,22 +594,39 @@ ClientLibraryTestBase::CreateParameterAndTransferLiteral(int64 parameter_number,
                                            nullptr, builder, data_handle);
 }
 
+Shape ClientLibraryTestBase::MaybeConvertShapeToBfloat16(const Shape& shape) {
+  if (!use_bfloat16_) {
+    return shape;
+  }
+  Shape new_shape = shape;
+  ShapeUtil::ForEachMutableSubshape(&new_shape,
+                                    [](Shape* subshape, const ShapeIndex&) {
+                                      if (subshape->element_type() == F32) {
+                                        subshape->set_element_type(BF16);
+                                      }
+                                    });
+  return new_shape;
+}
+
+Literal ClientLibraryTestBase::MaybeConvertLiteralToBfloat16(
+    const Literal& literal) {
+  if (use_bfloat16_) {
+    return std::move(*LiteralUtil::ConvertF32ToBF16(literal));
+  }
+  return literal.Clone();
+}
+
 std::unique_ptr<GlobalData>
 ClientLibraryTestBase::CreateParameterAndTransferLiteral(
     int64 parameter_number, const Literal& literal, const string& name,
     const DeviceHandle* device_handle, XlaBuilder* builder,
     XlaOp* data_handle) {
-  const Literal* param_literal = &literal;
-  std::unique_ptr<Literal> converted_literal;
-  if (use_bfloat16_) {
-    converted_literal = LiteralUtil::ConvertF32ToBF16(literal);
-    param_literal = converted_literal.get();
-  }
+  Literal param_literal = MaybeConvertLiteralToBfloat16(literal);
   std::unique_ptr<GlobalData> data =
-      client_->TransferToServer(*param_literal, device_handle)
+      client_->TransferToServer(param_literal, device_handle)
           .ConsumeValueOrDie();
   *data_handle =
-      Parameter(builder, parameter_number, param_literal->shape(), name);
+      Parameter(builder, parameter_number, param_literal.shape(), name);
   return data;
 }
 
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index fcc9347db519c2fe5b7804d381c7b4d14a85b8cc..4a6e8a31241d39db21935576d57f0acb17caef11 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -26,8 +26,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
@@ -399,12 +399,16 @@ class ClientLibraryTestBase : public ::testing::Test {
                                const string& error_message)>& verify_output,
       const Shape* output_with_layout = nullptr);
 
+  // Converts an f32 shape/literal to bf16 if use_bfloat16_ is true.
+  Literal MaybeConvertLiteralToBfloat16(const Literal& literal);
+  Shape MaybeConvertShapeToBfloat16(const Shape& shape);
+
   // Whether to run tests with all float-type input/output converted to
   // bfloat16.
   bool use_bfloat16_ = false;
 
   // Arguments to be passed to the computation when it runs.
-  std::vector<std::unique_ptr<GlobalData>> arguments_;
+  std::vector<Literal> arguments_;
 };
 
 template <typename NativeT>
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 6ce2f844a34b01cd07df97a9bb12842490838be6..c898dacf489db97223e2918414daf5de88bece64 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tests/compilation_cache_test.cc b/tensorflow/compiler/xla/tests/compilation_cache_test.cc
index ff3824628676df8a37f3a98742d9423fd42928e4..7c52c9fbbb57f9291ea9f0966e2efa715819fb67 100644
--- a/tensorflow/compiler/xla/tests/compilation_cache_test.cc
+++ b/tensorflow/compiler/xla/tests/compilation_cache_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc
index 64bf8b3b3878f74e0557afc520c9ae342bd07c4a..5a06d061f0d83fff547502495ff8ab13fb421b70 100644
--- a/tensorflow/compiler/xla/tests/compute_constant_test.cc
+++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc
index 9f288634c0fa7d7dffa7f9c1af3a0752996cdec2..be017477d84eb9faf5aa79dcdf54d6b6aaf6fd8e 100644
--- a/tensorflow/compiler/xla/tests/concat_test.cc
+++ b/tensorflow/compiler/xla/tests/concat_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
diff --git a/tensorflow/compiler/xla/tests/conditional_test.cc b/tensorflow/compiler/xla/tests/conditional_test.cc
index 369663de150964bd271b92d91dcd9276c87a763c..b27c1044baf2c0002f166c53a81e4361c60d012a 100644
--- a/tensorflow/compiler/xla/tests/conditional_test.cc
+++ b/tensorflow/compiler/xla/tests/conditional_test.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/constants_test.cc b/tensorflow/compiler/xla/tests/constants_test.cc
index 71d72a9828c5445be2cb1f559cf31363507bcd8d..49375748319ad5fe40db507a034ec4b07adb7e84 100644
--- a/tensorflow/compiler/xla/tests/constants_test.cc
+++ b/tensorflow/compiler/xla/tests/constants_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc
index dca57fd1c705da758ad0305b7799d6e806bbf72f..1adc68cc4839dcd7d89741ec016f27bc9047c9a5 100644
--- a/tensorflow/compiler/xla/tests/convert_test.cc
+++ b/tensorflow/compiler/xla/tests/convert_test.cc
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <array>
 #include <cstdint>
 #include <limits>
 #include <memory>
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -52,13 +53,67 @@ TEST_F(ConvertTest, ConvertR1S32ToR1S32) {
   ComputeAndCompareR1<int32>(&builder, expected, {});
 }
 
+TEST_F(ConvertTest, ConvertR1S32ToR1U32) {
+  XlaBuilder builder(TestName());
+  auto a = ConstantR1<int32>(&builder, {42, 64});
+  ConvertElementType(a, U32);
+
+  std::vector<uint32> expected = {42, 64};
+  ComputeAndCompareR1<uint32>(&builder, expected, {});
+}
+
+TEST_F(ConvertTest, ConvertR1S32ToR1PRED) {
+  XlaBuilder builder(TestName());
+  auto a = ConstantR1<int32>(&builder, {42, 0, -64});
+  ConvertElementType(a, PRED);
+
+  std::array<bool, 3> expected = {true, false, true};
+  ComputeAndCompareR1<bool>(&builder, expected, {});
+}
+
+TEST_F(ConvertTest, ConvertR1U32ToR1U32) {
+  XlaBuilder builder(TestName());
+  auto a = ConstantR1<uint32>(&builder, {42, 64});
+  ConvertElementType(a, U32);
+
+  std::vector<uint32> expected = {42, 64};
+  ComputeAndCompareR1<uint32>(&builder, expected, {});
+}
+
+TEST_F(ConvertTest, ConvertR1U32ToR1S32) {
+  XlaBuilder builder(TestName());
+  auto a = ConstantR1<uint32>(&builder, {42, 64});
+  ConvertElementType(a, S32);
+
+  std::vector<int32> expected = {42, 64};
+  ComputeAndCompareR1<int32>(&builder, expected, {});
+}
+
+TEST_F(ConvertTest, ConvertR1U32ToR1PRED) {
+  XlaBuilder builder(TestName());
+  auto a = ConstantR1<uint32>(&builder, {42, 0, 64});
+  ConvertElementType(a, PRED);
+
+  std::array<bool, 3> expected = {true, false, true};
+  ComputeAndCompareR1<bool>(&builder, expected, {});
+}
+
 TEST_F(ConvertTest, ConvertR1F32ToR1F32) {
   XlaBuilder builder(TestName());
   auto a = ConstantR1<float>(&builder, {42.0f, 64.0f});
   ConvertElementType(a, F32);
 
   std::vector<float> expected = {42.0f, 64.0f};
-  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
+  ComputeAndCompareR1<float>(&builder, expected, {});
+}
+
+TEST_F(ConvertTest, ConvertR1F32ToR1PRED) {
+  XlaBuilder builder(TestName());
+  auto a = ConstantR1<float>(&builder, {42.0f, 0.0f, 64.0f});
+  ConvertElementType(a, PRED);
+
+  std::array<bool, 3> expected = {true, false, true};
+  ComputeAndCompareR1<bool>(&builder, expected, {});
 }
 
 TEST_F(ConvertTest, ConvertR1S32ToR1F32) {
@@ -67,7 +122,7 @@ TEST_F(ConvertTest, ConvertR1S32ToR1F32) {
   ConvertElementType(a, F32);
 
   std::vector<float> expected = {42.0f, 64.0f};
-  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
+  ComputeAndCompareR1<float>(&builder, expected, {});
 }
 
 TEST_F(ConvertTest, ConvertR1PREDToR1S32) {
@@ -79,6 +134,15 @@ TEST_F(ConvertTest, ConvertR1PREDToR1S32) {
   ComputeAndCompareR1<int32>(&builder, expected, {});
 }
 
+TEST_F(ConvertTest, ConvertR1PREDToR1U32) {
+  XlaBuilder builder(TestName());
+  auto a = ConstantR1<bool>(&builder, {true, false, true});
+  ConvertElementType(a, U32);
+
+  std::vector<uint32> expected = {1, 0, 1};
+  ComputeAndCompareR1<uint32>(&builder, expected, {});
+}
+
 TEST_F(ConvertTest, ConvertR1PREDToR1F32) {
   XlaBuilder builder(TestName());
   auto a = ConstantR1<bool>(&builder, {true, false, true});
@@ -94,7 +158,7 @@ XLA_TEST_F(ConvertTest, ConvertR1S0S32ToR1S0F32) {
   ConvertElementType(a, F32);
 
   std::vector<float> expected = {};
-  ComputeAndCompareR1<float>(&builder, expected, {}, ErrorSpec(0.0001));
+  ComputeAndCompareR1<float>(&builder, expected, {});
 }
 
 TEST_F(ConvertTest, ConvertR1F32ToR1S32) {
diff --git a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
index 944366410b14439aa33999185525f1029735e95b..7b6bbc4f571af2e11306f95c24e243e78e0f4f4e 100644
--- a/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_dimension_numbers_test.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index a8b8f74ca9603a71acefc0be2141d7b9caf2b73b..5ed8122e0073bde77bb2507a0ddd89c4365627c9 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
diff --git a/tensorflow/compiler/xla/tests/convolution_variants_test.cc b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
index 8792e7781b17465d94ae8ac8375a4523f368d720..6784c16715da72d337edf70fa51db42c59404136 100644
--- a/tensorflow/compiler/xla/tests/convolution_variants_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_variants_test.cc
@@ -27,7 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc
index 1dc6ff0f4f51b51002cfb868a51457c08a259a80..5ef273e5a26ea8a16db864974c9bfa2c296cbce8 100644
--- a/tensorflow/compiler/xla/tests/copy_test.cc
+++ b/tensorflow/compiler/xla/tests/copy_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
diff --git a/tensorflow/compiler/xla/tests/custom_call_test.cc b/tensorflow/compiler/xla/tests/custom_call_test.cc
index 90f3d1b874f4da09104dc066c6642db1d2e77997..13c777835eb2d2519d39205cdc96f0aac4850c7d 100644
--- a/tensorflow/compiler/xla/tests/custom_call_test.cc
+++ b/tensorflow/compiler/xla/tests/custom_call_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include <memory>
 #include <utility>
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h"
diff --git a/tensorflow/compiler/xla/tests/deallocation_test.cc b/tensorflow/compiler/xla/tests/deallocation_test.cc
index d4b3aac85bff283515088f6e61c9d2bad11f60d3..5f234f36a8543ad408fb3430b27844beb16a54b5 100644
--- a/tensorflow/compiler/xla/tests/deallocation_test.cc
+++ b/tensorflow/compiler/xla/tests/deallocation_test.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
diff --git a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
index a6a233e71aabc47c78ea291b71f8b831f1c60323..2db6503afab748d7b778e26b2f9350ac64c7778b 100644
--- a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tests/deep_graph_test.cc b/tensorflow/compiler/xla/tests/deep_graph_test.cc
index 810947ab01b69b10b6ae60c551bd7aba10a6313d..3f3e8ab712fea14be9e4a7015effdf8ce518309b 100644
--- a/tensorflow/compiler/xla/tests/deep_graph_test.cc
+++ b/tensorflow/compiler/xla/tests/deep_graph_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index d86fd7cc2d4da10ed726ca11a6d9f86287a5d11e..cfd36abf47c0e510b41b4ce8dfba077f4119a6c2 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
index 88ac96d6b0f9206ef1ed0e4135495d7903ebf3f4..7f6f203a1ba48e0053f799c58bbbeae87aef1f7f 100644
--- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
diff --git a/tensorflow/compiler/xla/tests/execution_profile_test.cc b/tensorflow/compiler/xla/tests/execution_profile_test.cc
index ebba13c5b39e241ff01c9ddcba8a1c04180b4bd0..5116e60ca63ef5f94b25b15e6616086fb9e44bbb 100644
--- a/tensorflow/compiler/xla/tests/execution_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/execution_profile_test.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/global_data.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
index 86bfaea4ef43ad382e497fd281ec5439f001b56f..bf1de02ba9dbd97db9ee31484402fe9b92385219 100644
--- a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
+++ b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/floor_ceil_test.cc b/tensorflow/compiler/xla/tests/floor_ceil_test.cc
index 30dc639f117b9871238f0bf1628502cf8bef2e0c..39cc6c5927f1d416e31f689487efc10c20371abe 100644
--- a/tensorflow/compiler/xla/tests/floor_ceil_test.cc
+++ b/tensorflow/compiler/xla/tests/floor_ceil_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include <string>
 
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/fmax_test.cc b/tensorflow/compiler/xla/tests/fmax_test.cc
index 0254ae1baaa864b38c3b217a5c2026d34b7f7d12..c5bbbe778df15d63a2586bd6291a7a33fc82aa52 100644
--- a/tensorflow/compiler/xla/tests/fmax_test.cc
+++ b/tensorflow/compiler/xla/tests/fmax_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc
index dc6447793575c757fdd68baea03f416a951e45fc..792be0d3fcd55621b9f8cdf0fdc28f7bb49294d1 100644
--- a/tensorflow/compiler/xla/tests/fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/fusion_test.cc
@@ -25,7 +25,7 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
@@ -799,6 +799,46 @@ ENTRY main {
       *result));
 }
 
+class FusionClientLibraryTest : public ClientLibraryTestBase {};
+
+XLA_TEST_F(FusionClientLibraryTest, ManyLayoutTransformations) {
+  // On the GPU backend, it's possible to have too many transposes within one
+  // fusion, causing the kernel to run out shared memory and thus not compile.
+  // We want to check that doesn't happen.
+  //
+  // To do this, we create a computation that computes
+  //
+  //   P0 + P0*P1*P1 + P0*P2*P2 ...
+  //
+  // where even parameters have layout 1 and odd parameters have layout 2.
+  //
+  // Our goal is to tempt the backend into creating one giant multi-output
+  // fusion for the whole computation, including the transposes.  Currently
+  // multi-output fusion only fuses fusions, so each of the terms in the sum
+  // needs to be a fusion itself, thus the contortions above.
+  constexpr int kNumParams = 25;
+  XlaBuilder b("ManyLayoutTransformations");
+
+  // This test produces values that overflow int32, which is UB, so use uint32,
+  // where overflow is OK.
+  Array2D<uint32> arr(32, 32);
+  arr.FillUnique();
+  std::unique_ptr<Literal> l1 = LiteralUtil::CreateR2FromArray2D(arr)->Relayout(
+      LayoutUtil::MakeLayout({0, 1}));
+
+  std::unique_ptr<Literal> l2 = LiteralUtil::CreateR2FromArray2D(arr)->Relayout(
+      LayoutUtil::MakeLayout({1, 0}));
+
+  XlaOp p0 = AddParam(*l1, &b);
+  XlaOp sum = p0;
+  for (int i = 1; i < kNumParams; ++i) {
+    auto pN = AddParam((i % 2 == 0 ? *l1 : *l2), &b);
+    sum = sum + p0 * pN * pN;
+  }
+
+  ComputeAndCompare(&b, {});
+}
+
 void BM_ParallelFusion(int num_iters) {
   // Simple element-wise computation to benchmark parallel task partitioning.
   tensorflow::testing::StopTiming();
diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc
index c5ca64fa3f0fc70542c577828d53eeecbd05067b..b77bece85ad1b2192b04330af9e60d3a424b59f4 100644
--- a/tensorflow/compiler/xla/tests/gather_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/status_macros.h"
diff --git a/tensorflow/compiler/xla/tests/half_test.cc b/tensorflow/compiler/xla/tests/half_test.cc
index 73a47eda721971c75f61109787844c40be0b7080..51450314b611b49c643fb6fd5b0c0d2e7205a2d2 100644
--- a/tensorflow/compiler/xla/tests/half_test.cc
+++ b/tensorflow/compiler/xla/tests/half_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include <cmath>
 #include <vector>
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -48,7 +48,8 @@ class UnaryOpTest : public HalfTestBase,
                     public ::testing::WithParamInterface<UnaryOpTestParam> {};
 
 XLA_TEST_P(UnaryOpTest, Ops) {
-  std::vector<half> x({half(1.4), half(-2.3), half(3.2), half(-4.1)});
+  std::vector<half> x({half(1.4), half(-2.3), half(3.2), half(-4.1), half(9.0),
+                       half(42.0), half(-9.0), half(-100.0)});
   XlaBuilder builder(TestName());
   XlaOp x_opnd;
   auto x_data = CreateR1Parameter<half>(x, /*parameter_number=*/0, "x",
diff --git a/tensorflow/compiler/xla/tests/hlo_metadata_test.cc b/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
index 4d82442f7e3630c115eff1f17544e2b892c5e7eb..5511190caf95544e2ac48d91c0a138db06a2544c 100644
--- a/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
+++ b/tensorflow/compiler/xla/tests/hlo_metadata_test.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/local_client_test_base.h"
diff --git a/tensorflow/compiler/xla/tests/iota_test.cc b/tensorflow/compiler/xla/tests/iota_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f950aa1e8fe745075234a5ebff52d92be7378a5d
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/iota_test.cc
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <numeric>
+#include <vector>
+
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/core/lib/core/errors.h"
+
+namespace xla {
+namespace {
+
+class IotaTest : public ClientLibraryTestBase {
+ public:
+  explicit IotaTest(se::Platform* platform = nullptr)
+      : ClientLibraryTestBase(platform) {}
+  template <typename T>
+  std::vector<T> GetExpected(const int64 num_elements) {
+    std::vector<T> result(num_elements);
+    std::iota(result.begin(), result.end(), 0);
+    return result;
+  }
+};
+
+TEST_F(IotaTest, SimpleR1) {
+  for (int num_elements = 1; num_elements < 10000001; num_elements *= 10) {
+    {
+      XlaBuilder builder(TestName() + "_f32");
+      IotaGen(&builder, F32, num_elements);
+      ComputeAndCompareR1<float>(&builder, GetExpected<float>(num_elements), {},
+                                 ErrorSpec{0.0001});
+    }
+    {
+      XlaBuilder builder(TestName() + "_u32");
+      IotaGen(&builder, U32, num_elements);
+      ComputeAndCompareR1<uint32>(&builder, GetExpected<uint32>(num_elements),
+                                  {});
+    }
+    {
+      XlaBuilder builder(TestName() + "_s32");
+      IotaGen(&builder, S32, num_elements);
+      ComputeAndCompareR1<int32>(&builder, GetExpected<int32>(num_elements),
+                                 {});
+    }
+  }
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
index 0df50150aee69749beea79ff522fb6f820d1945d..e2cd5bcc5a95f692dcf4a43d717252bfe876aa81 100644
--- a/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_allocation_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
index 70612e7c49d2815096cc54fd6ae796148249b4db..74494e60e883417d5772ce71544715aef5ef3ef2 100644
--- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
+++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc
@@ -21,8 +21,8 @@ limitations under the License.
 
 #include "llvm/ADT/Triple.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -92,9 +92,10 @@ int main(int argc, char** argv) {
   // It's lame to hard-code the buffer assignments, but we need
   // local_client_aot_test.cc to be able to easily invoke the function.
   CHECK_EQ(result->result_buffer_index(), 1);
-  CHECK_EQ(result->buffer_sizes().size(), 2);
+  CHECK_EQ(result->buffer_sizes().size(), 3);
   CHECK_EQ(result->buffer_sizes()[0], -1);             // param buffer
   CHECK_EQ(result->buffer_sizes()[1], sizeof(float));  // result buffer
+  CHECK_EQ(result->buffer_sizes()[2], -1);             // const buffer
   if (triple.isOSBinFormatELF()) {
     // Check the ELF magic.
     CHECK_EQ(result->object_file_data()[0], 0x7F);
diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
index 2f4d197ae632c08cb80b5d09ab4918f018e992ef..1a823cf189b310c62c735419936544ea99fcfbaf 100644
--- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc
+++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc
@@ -19,7 +19,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
@@ -772,6 +772,10 @@ XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) {
   ScopedShapedBuffer result =
       executable->Run({&x_array}, DefaultExecutableRunOptions())
           .ConsumeValueOrDie();
+  ASSERT_IS_OK(local_client_->mutable_backend()
+                   ->BorrowStream(0)
+                   .ValueOrDie()
+                   ->BlockHostUntilDone());
 
   LiteralTestUtil::ExpectR1Near<float>(
       {2.0f, 4.0f, 6.0f}, *ShapedBufferToLiteral(result), error_spec_);
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc
index 88797a7d0a7d0567b3a380c5fb1ad0c0ee875587..eaddf756dbc913dd9668cd22228fbd18c2c33309 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.cc
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -189,7 +190,19 @@ StatusOr<ScopedShapedBuffer> LocalClientTestBase::ExecuteLocally(
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<LocalExecutable> executable,
       local_client_->Compile(computation, argument_layouts, build_options));
-  return executable->Run(arguments, run_options);
+  TF_ASSIGN_OR_RETURN(auto ret, executable->Run(arguments, run_options));
+
+  auto device_ordinal =
+      build_options.device_ordinal() == -1 ? 0 : build_options.device_ordinal();
+  auto* stream = run_options.stream();
+  if (!stream) {
+    stream = local_client_->mutable_backend()
+                 ->BorrowStream(device_ordinal)
+                 .ValueOrDie()
+                 .get();
+  }
+  TF_RETURN_IF_ERROR(stream->BlockHostUntilDone());
+  return std::move(ret);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h
index 258226523d830b40ecaa761df95988dc90f5ca47..b4477e9a6b23363ee3a1380f9f98f4b8226f6920 100644
--- a/tensorflow/compiler/xla/tests/local_client_test_base.h
+++ b/tensorflow/compiler/xla/tests/local_client_test_base.h
@@ -22,7 +22,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
diff --git a/tensorflow/compiler/xla/tests/log_test.cc b/tensorflow/compiler/xla/tests/log_test.cc
index cdf70ee4185be2ecd9dcb2d21fbd98c2ab6cc0ad..2d622242e657ce032a17f7b26c94227d343e2a38 100644
--- a/tensorflow/compiler/xla/tests/log_test.cc
+++ b/tensorflow/compiler/xla/tests/log_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc
index 7ddc6369319810c0806afa161bc00f51caea2072..0732e195d44d738b264361e43d38259c26a4116e 100644
--- a/tensorflow/compiler/xla/tests/map_test.cc
+++ b/tensorflow/compiler/xla/tests/map_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
index 069b8a881f4be0c05b19bb1f323bdc13c7222ceb..da8c42d465340f2af3d6acd2c3676b69512f193f 100644
--- a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
diff --git a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
index e576f000ef23e761d6fa818457eec2144d4bcb00..955dbef6dcd28421fb351c6ee064ac53eda1fd08 100644
--- a/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
+++ b/tensorflow/compiler/xla/tests/multidimensional_slice_test.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/outfeed_in_nested_computation_test.cc b/tensorflow/compiler/xla/tests/outfeed_in_nested_computation_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cea7006526f0c56ade3cedead489ea12c0ab3922
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/outfeed_in_nested_computation_test.cc
@@ -0,0 +1,168 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/tests/local_client_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+// Tests that ensure outfeed instructions that are contained in nested
+// computations in non-root positions are executed.
+
+class LocalClientExecuteTest : public LocalClientTestBase {};
+
+TEST_F(LocalClientExecuteTest, OutfeedInWhile) {
+  XlaBuilder b(TestName());
+
+  Shape state_tuple_array_shape = ShapeUtil::MakeShape(xla::S32, {10, 5});
+  Shape int_shape = ShapeUtil::MakeShape(xla::S32, {});
+  Shape state_tuple_shape =
+      ShapeUtil::MakeTupleShape({int_shape, state_tuple_array_shape});
+  Shape xfeed_shape = ShapeUtil::MakeShape(xla::S32, {2});
+
+  XlaOp some_buffer = Broadcast(ConstantR0<int32_t>(&b, 0), {10, 5});
+  XlaOp num_iter = Infeed(&b, int_shape);
+  XlaOp init_tuple = Tuple(&b, {num_iter, some_buffer});
+
+  TF_ASSERT_OK_AND_ASSIGN(XlaComputation loop_cond, [&] {
+    // Condition: iteration variable > 0
+    XlaBuilder cond_builder("loop_condition");
+    XlaOp state_tuple = Parameter(&cond_builder, 0, state_tuple_shape, "state");
+    XlaOp loop_counter = GetTupleElement(state_tuple, 0);
+    Outfeed(loop_counter, int_shape, "");
+    Gt(loop_counter, ConstantR0<int32_t>(&cond_builder, 0));
+    return cond_builder.Build();
+  }());
+
+  TF_ASSERT_OK_AND_ASSIGN(XlaComputation loop_body, [&] {
+    XlaBuilder body_builder("loop_body");
+    XlaOp state_tuple = Parameter(&body_builder, 0, state_tuple_shape, "state");
+    XlaOp loop_counter = GetTupleElement(state_tuple, 0);
+    XlaOp buffer_inside = GetTupleElement(state_tuple, 1);
+
+    // Read some stuff from Infeed.
+    XlaOp some_input = Infeed(&body_builder, xfeed_shape);
+    XlaOp sum = Add(some_input, Broadcast(loop_counter, {2}));
+    Outfeed(sum, xfeed_shape, "");
+
+    XlaOp iter_left = Sub(loop_counter, ConstantR0<int32_t>(&body_builder, 1));
+
+    Tuple(&body_builder, {iter_left, buffer_inside});
+    return body_builder.Build();
+  }());
+
+  // Build loop.
+  XlaOp result_tuple = While(loop_cond, loop_body, init_tuple);
+  GetTupleElement(result_tuple, 0);
+  TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, b.Build());
+
+  std::unique_ptr<xla::Literal> comp_result;
+  std::unique_ptr<tensorflow::Thread> thread(
+      tensorflow::Env::Default()->StartThread(
+          tensorflow::ThreadOptions(), "execute_thread", [&] {
+            comp_result = local_client_->ExecuteAndTransfer(computation, {})
+                              .ConsumeValueOrDie();
+          }));
+
+  VLOG(1) << "Transferring trip count to computation";
+  // Transfer number of iterations to Infeed.
+  TF_ASSERT_OK(
+      local_client_->TransferToInfeed(*LiteralUtil::CreateR0<int32_t>(1)));
+
+  // Pick up value from outfeed
+  {
+    VLOG(1) << "Reading from condition outfeed";
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> r,
+                            local_client_->TransferFromOutfeed(&int_shape));
+    EXPECT_EQ(r->Get<int32>({}), 1);
+  }
+
+  VLOG(1) << "Writing data to infeed";
+  // Transfer some stuff to Infeed for use inside of loop.
+  TF_ASSERT_OK(local_client_->TransferToInfeed(
+      *LiteralUtil::CreateR1<int32_t>({10, 20})));
+
+  // Pick up value from outfeed
+  {
+    VLOG(1) << "Reading from body outfeed";
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> r,
+                            local_client_->TransferFromOutfeed(&xfeed_shape));
+    EXPECT_EQ(r->Get<int32>({0}), 11);
+    EXPECT_EQ(r->Get<int32>({1}), 21);
+  }
+
+  {
+    VLOG(1) << "Reading from condition outfeed";
+    TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> r,
+                            local_client_->TransferFromOutfeed(&int_shape));
+    EXPECT_EQ(r->Get<int32>({}), 0);
+  }
+
+  // Joins the thread
+  thread.reset();
+
+  EXPECT_EQ(comp_result->Get<int32>({}), 0);
+}
+
+TEST_F(LocalClientExecuteTest, OutfeedInConditional) {
+  XlaBuilder b(TestName());
+
+  Shape condition_shape = ShapeUtil::MakeShape(xla::PRED, {});
+  Shape result_shape = ShapeUtil::MakeShape(xla::PRED, {});
+
+  TF_ASSERT_OK_AND_ASSIGN(XlaComputation true_computation, [&] {
+    XlaBuilder inner_builder("true_computation");
+    XlaOp param = Parameter(&inner_builder, 0, result_shape, "param");
+    Outfeed(param, result_shape, "");
+    Or(param, param);
+    return inner_builder.Build();
+  }());
+
+  TF_ASSERT_OK_AND_ASSIGN(XlaComputation false_computation, [&] {
+    XlaBuilder inner_builder("false_computation");
+    Parameter(&inner_builder, 0, result_shape, "param");
+    return inner_builder.Build();
+  }());
+
+  XlaOp pred = Infeed(&b, condition_shape);
+  Conditional(/*predicate=*/pred, /*true_operand=*/pred,
+              /*true_computation=*/true_computation, /*false_operand=*/pred,
+              /*false_computation=*/false_computation);
+
+  TF_ASSERT_OK_AND_ASSIGN(XlaComputation computation, b.Build());
+
+  std::unique_ptr<xla::Literal> comp_result;
+  std::unique_ptr<tensorflow::Thread> thread(
+      tensorflow::Env::Default()->StartThread(
+          tensorflow::ThreadOptions(), "execute_thread", [&] {
+            comp_result = local_client_->ExecuteAndTransfer(computation, {})
+                              .ConsumeValueOrDie();
+          }));
+
+  TF_ASSERT_OK(
+      local_client_->TransferToInfeed(*LiteralUtil::CreateR0<bool>(true)));
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Literal> r,
+                          local_client_->TransferFromOutfeed(&result_shape));
+
+  EXPECT_EQ(r->Get<bool>({}), true);
+
+  // Join the thread
+  thread.reset();
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/pad_test.cc b/tensorflow/compiler/xla/tests/pad_test.cc
index e428fa9b5e14d0cb6e5610a1b69b07c6b0c9952a..ca21b0b2ba590a6daadf2c8d3d9ad213514b0f0f 100644
--- a/tensorflow/compiler/xla/tests/pad_test.cc
+++ b/tensorflow/compiler/xla/tests/pad_test.cc
@@ -20,7 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc
index 8ba1d11b33344463ffcb059f453754f31e177184..f6c762e7a4bee91a26c4c2e033c3717fef6d91d0 100644
--- a/tensorflow/compiler/xla/tests/params_test.cc
+++ b/tensorflow/compiler/xla/tests/params_test.cc
@@ -21,8 +21,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/xla/tests/pred_test.cc b/tensorflow/compiler/xla/tests/pred_test.cc
index 5c351b2d113709105244de4aafa49d7cc535ced1..2fc7f816b56db6f57ca835d1847476b6d622ce5e 100644
--- a/tensorflow/compiler/xla/tests/pred_test.cc
+++ b/tensorflow/compiler/xla/tests/pred_test.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc
index 5ebf8344d2b113b15f049b001c044c29c21c9004..029af69573e458a45cf1e446e942c7401cd9e629 100644
--- a/tensorflow/compiler/xla/tests/prng_test.cc
+++ b/tensorflow/compiler/xla/tests/prng_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -177,7 +177,7 @@ XLA_TEST_F(PrngTest, Uniformity108) {
   EXPECT_LT(UniformChiSquared(108, 256), 132.144);
 }
 XLA_TEST_F(PrngTest, Uniformity256) {
-  EXPECT_LT(UniformChiSquared(256, 256), 293.248);
+  EXPECT_LT(UniformChiSquared(256, 512), 293.248);
 }
 
 XLA_TEST_F(PrngTest, MapUsingRng) {
diff --git a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
index 526a38e8d1dbed9cdd4a31bfbec49bc5c6bb174b..fab2a65de109c670a6854c0fc1118162acf3d312 100644
--- a/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
+++ b/tensorflow/compiler/xla/tests/query_inferred_shape_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
diff --git a/tensorflow/compiler/xla/tests/reduce_precision_test.cc b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
index 04c7f316463441d1bd458393b29ea5eb2acb9c9b..531648fe3eb8e3941c5e3c012847ee68c616590f 100644
--- a/tensorflow/compiler/xla/tests/reduce_precision_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_precision_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h"
diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc
index 1407fca72fd494c27fa999e67d69ecf36cbff81b..2065271a7f686c52c88df80b0efe8f2e1542d198 100644
--- a/tensorflow/compiler/xla/tests/reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_test.cc
@@ -37,8 +37,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -125,10 +125,10 @@ class ReduceTest : public ClientLibraryTestBase {
     XlaComputation reduce;
     if (and_reduce) {
       init_value = ConstantR0<bool>(&builder, true);
-      reduce = CreateScalarAndComputation(&builder);
+      reduce = CreateScalarAndComputation(PRED, &builder);
     } else {
       init_value = ConstantR0<bool>(&builder, false);
-      reduce = CreateScalarOrComputation(&builder);
+      reduce = CreateScalarOrComputation(PRED, &builder);
     }
     Reduce(pred_values, init_value, reduce,
            /*dimensions_to_reduce=*/{0});
@@ -163,10 +163,10 @@ class ReduceTest : public ClientLibraryTestBase {
     XlaComputation reduce_op;
     if (and_reduce) {
       init_value = ConstantR0<bool>(&builder, true);
-      reduce_op = CreateScalarAndComputation(&builder);
+      reduce_op = CreateScalarAndComputation(PRED, &builder);
     } else {
       init_value = ConstantR0<bool>(&builder, false);
-      reduce_op = CreateScalarOrComputation(&builder);
+      reduce_op = CreateScalarOrComputation(PRED, &builder);
     }
 
     Reduce(input_pred, init_value, reduce_op,
@@ -798,13 +798,17 @@ XLA_TEST_F(ReduceTest, VectorizedReduce_Min) {
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_BooleanAnd) {
   RunVectorizedReduceTestForType<bool>(
-      static_cast<FuncGenerator>(CreateScalarAndComputation),
+      static_cast<FuncGenerator>([](XlaBuilder* builder) {
+        return CreateScalarAndComputation(PRED, builder);
+      }),
       [](bool a, bool b) { return a && b; }, true);
 }
 
 XLA_TEST_F(ReduceTest, VectorizedReduce_BooleanOr) {
   RunVectorizedReduceTestForType<bool>(
-      static_cast<FuncGenerator>(CreateScalarOrComputation),
+      static_cast<FuncGenerator>([](XlaBuilder* builder) {
+        return CreateScalarOrComputation(PRED, builder);
+      }),
       [](bool a, bool b) { return a || b; }, false);
 }
 
@@ -963,5 +967,32 @@ XLA_TEST_F(ReduceTest, ReduceIdentity) {
       ErrorSpec(0.0001));
 }
 
+XLA_TEST_F(ReduceTest, AndReduceU64) {
+  XlaBuilder builder(TestName());
+  Array2D<uint64> initializer = {{0x123456789ABCDEF0LL, 0x3BCDEF12A4567890LL},
+                                 {0XFFFFFFFFFFFFFFD6LL, 101},
+                                 {1, 0XFFFFFFFFFFFFFFFFLL}};
+  auto reducer = CreateScalarAndComputation(U64, &builder);
+  auto m = ConstantR2FromArray2D(&builder, initializer);
+  Reduce(m, ConstantR0<uint64>(&builder, 0xFFFFFFFFFFFFFFFFLL), reducer, {1});
+
+  std::vector<uint64> expected = {0x1204461080145890LL, 68, 1};
+  ComputeAndCompareR1<uint64>(&builder, expected, {});
+}
+
+XLA_TEST_F(ReduceTest, OrReduceU64) {
+  XlaBuilder builder(TestName());
+  Array2D<uint64> initializer = {{0x123456789ABCDEF0LL, 0x3BCDEF12A4567890LL},
+                                 {0xFFFFFFFFFFFFFFD6LL, 101},
+                                 {1, 0xCAFEBEEFABABABABLL}};
+  auto reducer = CreateScalarOrComputation(U64, &builder);
+  auto m = ConstantR2FromArray2D(&builder, initializer);
+  Reduce(m, ConstantR0<uint64>(&builder, 0), reducer, {1});
+
+  std::vector<uint64> expected = {0X3BFDFF7ABEFEFEF0LL, 0XFFFFFFFFFFFFFFF7LL,
+                                  0xCAFEBEEFABABABABLL};
+  ComputeAndCompareR1<uint64>(&builder, expected, {});
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc
index c2681f70f7e5727462c20d5eb3120bd34fd75765..1bd6fdab31d6c3516339bdb98459ffe3bbdef1d1 100644
--- a/tensorflow/compiler/xla/tests/reduce_window_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc
@@ -24,8 +24,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
diff --git a/tensorflow/compiler/xla/tests/replay_test.cc b/tensorflow/compiler/xla/tests/replay_test.cc
index d544968648d7602464bd141a12c75eeb8c1678da..d8914513819415368a628eab1f482f9644dd46b1 100644
--- a/tensorflow/compiler/xla/tests/replay_test.cc
+++ b/tensorflow/compiler/xla/tests/replay_test.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
diff --git a/tensorflow/compiler/xla/tests/reshape_motion_test.cc b/tensorflow/compiler/xla/tests/reshape_motion_test.cc
index 7c0389cfa3251a6b62f83a78e986d870177d4d91..368f5583c9ce3773e57b858ff7606f679346529a 100644
--- a/tensorflow/compiler/xla/tests/reshape_motion_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_motion_test.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/reference_util.h"
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index 46d91711a55c5aa0ad906bc9ba0265fab194cf1a..382d1b1ae741285dcd1f7761edb82a5c333887af 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -22,8 +22,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/reference_util.h"
diff --git a/tensorflow/compiler/xla/tests/reverse_test.cc b/tensorflow/compiler/xla/tests/reverse_test.cc
index 23f0d26d93bf979970d112993c0a945fb4fe7d53..41e49b4003236d55d85592315652a0ddefd5c485 100644
--- a/tensorflow/compiler/xla/tests/reverse_test.cc
+++ b/tensorflow/compiler/xla/tests/reverse_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
index 3b603c0d31565c20ff4e43c3ffd6001e9d54612e..e42c71eb284deb2e50d6ea4b47fa707e4bc14ffc 100644
--- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc
+++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc
@@ -19,8 +19,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
diff --git a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
index b1f1e69d3cdb9398a2ebc929a344a69adc6c2223..e3d4f98dd7432d1dce7e697586e8b17105dc82e7 100644
--- a/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
+++ b/tensorflow/compiler/xla/tests/select_and_scatter_test.cc
@@ -22,8 +22,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/client/padding.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/reference_util.h"
diff --git a/tensorflow/compiler/xla/tests/select_test.cc b/tensorflow/compiler/xla/tests/select_test.cc
index 59409ab26e1c19a8271318c18e19caa7b8ddc3b7..1c01402798658877889527a5dd02d5c74787ff99 100644
--- a/tensorflow/compiler/xla/tests/select_test.cc
+++ b/tensorflow/compiler/xla/tests/select_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc
index a593faca0035b64670f294f81fd5b6d95f35cd88..b8ad6668f80a3002eff3cc458997966ee67c8d4b 100644
--- a/tensorflow/compiler/xla/tests/slice_test.cc
+++ b/tensorflow/compiler/xla/tests/slice_test.cc
@@ -20,7 +20,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
index 8f424ae81f592bfd8accd8decb8fc363f7561c73..a2f0338e25977d7c76dbc48b3afc649b77ba4ee2 100644
--- a/tensorflow/compiler/xla/tests/test_utils_test.cc
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/local_client_test_base.h"
diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
index 0f86b7f20f9bd7597ece713626ee0e9c23509e05..125513ddfd16cb4e742e7d589e22b721307621ee 100644
--- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc
+++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/generic_transfer_manager.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
@@ -60,7 +61,7 @@ class TransferManagerTest : public LocalClientTestBase {
   }
 
  protected:
-  Backend::StreamPtr stream_ptr_;
+  StreamPool::Ptr stream_ptr_;
   se::Stream* stream_;
 
  private:
diff --git a/tensorflow/compiler/xla/tests/transpose_test.cc b/tensorflow/compiler/xla/tests/transpose_test.cc
index 6ebb4324f8d20ed9f8886d92b0513441685ed19b..fbe9d1b64aa0c06d65b547c45cfa981800d40ff3 100644
--- a/tensorflow/compiler/xla/tests/transpose_test.cc
+++ b/tensorflow/compiler/xla/tests/transpose_test.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/reference_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc
index bf86c5dfb6a1caf2c8574a5a4f7b77d982039bde..2fd70b72b52f360fc74a73cd13d401b7dac6e708 100644
--- a/tensorflow/compiler/xla/tests/tuple_test.cc
+++ b/tensorflow/compiler/xla/tests/tuple_test.cc
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
@@ -545,5 +546,51 @@ XLA_TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) {
       *result));
 }
 
+// Disabled on interpreter due to lack of outfeed.
+XLA_TEST_F(TupleHloTest,
+           DISABLED_ON_INTERPRETER(NonAmbiguousTopLevelAllocation)) {
+  const char* testcase = R"(
+    HloModule tuple
+
+    ENTRY main {
+      a = f32[2] parameter(0)
+      b = f32[2] parameter(1)
+      c = f32[2] parameter(2)
+      d = f32[2] parameter(3)
+      cond = pred[] parameter(4)
+
+      tup0 = (f32[2],f32[2]) tuple(a, b)
+      tup1 = (f32[2],f32[2]) tuple(c, d)
+
+      s = (f32[2],f32[2]) tuple-select(cond, tup0, tup1)
+      gte = f32[2] get-tuple-element(s), index=0
+      tuple = (f32[2]) tuple(gte)
+      token = token[] after-all()
+      ROOT outfeed = token[] outfeed(tuple, token)
+    }
+  )";
+  auto module =
+      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
+          .ValueOrDie();
+  auto param0 = LiteralUtil::CreateR1<float>({1, 2});
+  auto param1 = LiteralUtil::CreateR1<float>({2, 3});
+  auto param4 = LiteralUtil::CreateR0<bool>(false);
+  // Put execution on a separate thread so we can block on outfeed.
+  std::unique_ptr<tensorflow::Thread> thread(
+      tensorflow::Env::Default()->StartThread(
+          tensorflow::ThreadOptions(), "execute_thread", [&] {
+            TF_EXPECT_OK(Execute(std::move(module),
+                                 {param0.get(), param1.get(), param1.get(),
+                                  param0.get(), param4.get()})
+                             .status());
+          }));
+  auto expected =
+      LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR1<float>({2, 3}));
+  auto literal = MakeUnique<Literal>();
+  TF_EXPECT_OK(backend().transfer_manager()->TransferLiteralFromOutfeed(
+      backend().default_stream_executor(), expected->shape(), literal.get()));
+  EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *literal));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/unary_op_test.cc b/tensorflow/compiler/xla/tests/unary_op_test.cc
index a90a6fb0a5b5bb5119eee93c9c6a1377e3461b46..20ae68ab74026936c43e5f525eb796eb402a19cb 100644
--- a/tensorflow/compiler/xla/tests/unary_op_test.cc
+++ b/tensorflow/compiler/xla/tests/unary_op_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc
index ea3aba6df1d3fbd492a23b280309322b8524c0bf..ef1b1445bbe555da00db4446d59439b752735a80 100644
--- a/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_reduce_test.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array3d.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
index 79bae22dac9599a38c73ea1dc2e6b4856395ff79..3848ec1684cdc9186e14ac0b60315b7520d127f3 100644
--- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc
@@ -21,8 +21,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc
index 29befef92e44c4f7673e0c7153efad31d2bbc2b1..c81c27891c29394fe01116ca22fa678b0a409c62 100644
--- a/tensorflow/compiler/xla/tests/while_test.cc
+++ b/tensorflow/compiler/xla/tests/while_test.cc
@@ -20,8 +20,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index 4d4dd62a3f0426342012b4999c73891c0c601052..0ee8e68c88011d53ab6484e0bd81eb969304b6fb 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -18,10 +18,11 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/array2d.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
-#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
+#include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -133,7 +134,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
   DeviceMemoryAllocator* allocator = backend->memory_allocator();
   auto* transfer_manager = backend->transfer_manager();
   TF_ASSERT_OK_AND_ASSIGN(
-      Backend::StreamPtr stream_ptr,
+      StreamPool::Ptr stream_ptr,
       backend->BorrowStream(backend->default_device_ordinal()));
 
   TF_ASSERT_OK_AND_ASSIGN(
@@ -172,6 +173,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
       auto execution_result,
       executable->ExecuteOnStream(&run_options, {&lhs_arg, &rhs_arg},
                                   &hlo_execution_profile));
+  TF_ASSERT_OK(stream_ptr->BlockHostUntilDone());
   (void)execution_result;
 
   *profile_output =
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 55501827f29582324ce3308f2a7d96bc20b65760..d7cabbe876c662fc71237a0fb62141c93e69d14b 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -37,6 +37,7 @@ cc_library(
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service",
         "//tensorflow/compiler/xla/service:hlo_proto",
@@ -84,6 +85,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:testing",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service:hlo_proto",
@@ -164,6 +166,7 @@ tf_cc_binary(
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service:interpreter_plugin",
@@ -181,6 +184,7 @@ tf_cc_binary(
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/service",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_proto",
@@ -198,6 +202,7 @@ tf_cc_binary(
         "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/service",
         "//tensorflow/compiler/xla/service:hlo_graph_dumper",
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
index befb55453777dce30af89bcaad2ffe1647097576..f20dcef382b86d27d7c176ae7e4132ad1db7b901 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc
@@ -31,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/service.h"
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
index cfb8f37487d6499b803438a135be54524fcf17d2..f0af0580c1fbca455c6ed5f87f82971faee50a06 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/service.h"
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
index 5dd5150be339846d0775880931f615b92c5b08d8..f03e1b1f965af761c101555fd0275bc0425b9cf0 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
index a5dce20456c6a2402f425ebb3d575d1bb625f839..dc5c106d02cb679f3e6f5b2bea40bbb42f8bd1cc 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/service.h"
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index 854e797ec2e31d32d98f46a75c31ff89caac613b..3bb2f3c0007bbe92aed6a995790284c89719be91 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -42,6 +42,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/global_data.h"
 #include "tensorflow/compiler/xla/client/lib/testing.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index a173c51879f415ca98ad67015338ce5ab5e357a5..6a4e252b44881c679350e121b1793e3b797f0785 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -130,8 +130,12 @@ py_library(
         "//tensorflow/contrib/bigtable",  # depends on bigtable
         "//tensorflow/contrib/cloud:cloud_py",  # doesn't compile on Windows
         "//tensorflow/contrib/ffmpeg:ffmpeg_ops_py",
-        "//tensorflow/contrib/lite/python:lite",  # unix dependency, need to fix code
+        # TODO(aaroey): tensorrt dependency has to appear before tflite so the
+        # build can resolve its flatbuffers symbols within the tensorrt library.
+        # This is an issue with the tensorrt static library and will be fixed by
+        # the next tensorrt release, so fix the order here after that.
         "//tensorflow/contrib/tensorrt:init_py",  # doesn't compile on windows
+        "//tensorflow/contrib/lite/python:lite",  # unix dependency, need to fix code
     ]),
 )
 
diff --git a/tensorflow/contrib/android/cmake/src/main/AndroidManifest.xml b/tensorflow/contrib/android/cmake/src/main/AndroidManifest.xml
index bced47e046db889366bf88e563d086a8c367431a..c17110a78be49f70ef108be79a624d87ad9ed28d 100644
--- a/tensorflow/contrib/android/cmake/src/main/AndroidManifest.xml
+++ b/tensorflow/contrib/android/cmake/src/main/AndroidManifest.xml
@@ -1,6 +1,10 @@
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
     package="org.tensorflow.contrib.android">
 
+    <uses-sdk
+        android:minSdkVersion="4"
+        android:targetSdkVersion="19" />
+
     <application android:allowBackup="true" android:label="@string/app_name"
         android:supportsRtl="true">
 
diff --git a/tensorflow/contrib/autograph/README.md b/tensorflow/contrib/autograph/README.md
index 679ab48e5cf46a601f8b550773ca2b3f6c04957d..cc54da4daa9a5bb4e64145963ffec63021d08876 100644
--- a/tensorflow/contrib/autograph/README.md
+++ b/tensorflow/contrib/autograph/README.md
@@ -1,6 +1,6 @@
 # AutoGraph
 
-IMPORTANT: AutoGraph is alpha software, and under active development. Expect rough edges and bugs, but if you try it, we appreciate early feedback! We'd also love contributions ([please see our contributing guidelines](CONTRIBUTING.md) and our [style guide](STYLE_GUIDE.md)).
+IMPORTANT: AutoGraph is beta software, and under active development. Expect rough edges and bugs, but if you try it, we appreciate early feedback! We'd also love contributions ([please see our contributing guidelines](CONTRIBUTING.md) and our [style guide](STYLE_GUIDE.md)).
 
 AutoGraph is a Python to TensorFlow compiler.
 
@@ -68,12 +68,21 @@ Then import the `autograph` module from `tf.contrib`:
 from tensorflow.contrib import autograph as ag
 ```
 
-### Interactive demo notebooks
+### Related links
 
-For more extensive examples, check out these interactive notebooks:
+Articles:
 
- * [RNN trained using Keras and Estimators](https://colab.sandbox.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb)
+ * [TensorFlow blog post](https://medium.com/tensorflow/autograph-converts-python-into-tensorflow-graphs-b2a871f87ec7)
+
+Interactive notebooks:
+
+ * [Quick guide](https://colab.research.google.com/github/tensorflow/models/blob/master/samples/core/guide/autograph.ipynb)
+ * [RNN trained using Keras and Estimators](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb)
  * [Demo from the TF Dev Summit 2018](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb)
+ * [Basic control flow speed test](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/ag_vs_eager_collatz_speed_test.ipynb)
+ * [MNIST training speed test](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/ag_vs_eager_mnist_speed_test.ipynb)
+ * [Basic algorithm samples](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/algorithms.ipynb)
+ * [Introductory workshop support notebook](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/workshop.ipynb)
 
 ## Using with annotations
 
diff --git a/tensorflow/contrib/autograph/__init__.py b/tensorflow/contrib/autograph/__init__.py
index 7821c98f1c35df0ab60484e5f7255f2955ce80a1..26e7a4a4d38e264486c981e6fc4c547bcc53b302 100644
--- a/tensorflow/contrib/autograph/__init__.py
+++ b/tensorflow/contrib/autograph/__init__.py
@@ -22,20 +22,21 @@ from __future__ import division
 from __future__ import print_function
 
 # TODO(mdan): Bring only the relevant symbols to the top level.
-from tensorflow.contrib.autograph import utils
 from tensorflow.contrib.autograph import operators
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.core.errors import GraphConstructionError
+from tensorflow.contrib.autograph.core.errors import TfRuntimeError
+from tensorflow.contrib.autograph.core.errors import improved_errors
+from tensorflow.contrib.autograph.impl.api import RunMode
 from tensorflow.contrib.autograph.impl.api import convert
 from tensorflow.contrib.autograph.impl.api import converted_call
 from tensorflow.contrib.autograph.impl.api import do_not_convert
-from tensorflow.contrib.autograph.impl.api import RunMode
 from tensorflow.contrib.autograph.impl.api import to_code
-from tensorflow.contrib.autograph.core.errors import improved_errors
-from tensorflow.contrib.autograph.core.errors import GraphConstructionError
-from tensorflow.contrib.autograph.core.errors import TfRuntimeError
 from tensorflow.contrib.autograph.impl.api import to_graph
 from tensorflow.contrib.autograph.lang.directives import set_element_type
 from tensorflow.contrib.autograph.lang.directives import set_loop_options
 from tensorflow.contrib.autograph.lang.special_functions import stack
+from tensorflow.contrib.autograph.lang.special_functions import tensor_list
 from tensorflow.contrib.autograph.pyct.transformer import AutographParseError
 from tensorflow.python.util.all_util import remove_undocumented
 
@@ -57,6 +58,7 @@ _allowed_symbols = [
     'set_element_type',
     'set_loop_options',
     'stack',
+    'tensor_list',
     # Exceptions
     'AutographParseError',
     # Utilities: to be removed
diff --git a/tensorflow/contrib/autograph/converters/asserts_test.py b/tensorflow/contrib/autograph/converters/asserts_test.py
index 9c58ae3accaf7e3bca91750ecaf07845fafdbbae..38faba45df6746d56933a1647594af133b671628 100644
--- a/tensorflow/contrib/autograph/converters/asserts_test.py
+++ b/tensorflow/contrib/autograph/converters/asserts_test.py
@@ -35,7 +35,7 @@ class AssertsTest(converter_testing.TestCase):
     node, ctx = self.prepare(test_fn, {})
     node = asserts.transform(node, ctx)
 
-    self.assertTrue(isinstance(node.body[0].body[0].value, gast.Call))
+    self.assertTrue(isinstance(node.body[0].value, gast.Call))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/autograph/converters/directives_test.py b/tensorflow/contrib/autograph/converters/directives_test.py
index 5f798a5b76dc42b0553cf8bffcf2c6227aabc67c..a573ba5850609f65ea60432470485c523cd3da3b 100644
--- a/tensorflow/contrib/autograph/converters/directives_test.py
+++ b/tensorflow/contrib/autograph/converters/directives_test.py
@@ -38,7 +38,7 @@ class DirectivesTest(converter_testing.TestCase):
     node, ctx = self.prepare(test_fn, {'directives': directives})
     node = directives_converter.transform(node, ctx)
 
-    def_, = anno.getanno(node.body[0].body[0].targets[0],
+    def_, = anno.getanno(node.body[0].targets[0],
                          anno.Static.DEFINITIONS)
     d = def_.directives[directives.set_element_type]
     self.assertEqual(d['dtype'].s, 'a')
@@ -52,7 +52,7 @@ class DirectivesTest(converter_testing.TestCase):
     node, ctx = self.prepare(test_fn, {'directives': directives})
     node = directives_converter.transform(node, ctx)
 
-    def_, = anno.getanno(node.body[0].args.args[0], anno.Static.DEFINITIONS)
+    def_, = anno.getanno(node.args.args[0], anno.Static.DEFINITIONS)
     d = def_.directives[directives.set_element_type]
     self.assertEqual(d['dtype'].n, 1)
     self.assertEqual(d['shape'].n, 2)
@@ -67,7 +67,7 @@ class DirectivesTest(converter_testing.TestCase):
     node, ctx = self.prepare(test_fn, {'directives': directives})
     node = directives_converter.transform(node, ctx)
 
-    d = anno.getanno(node.body[0].body[1], AgAnno.DIRECTIVES)
+    d = anno.getanno(node.body[1], AgAnno.DIRECTIVES)
     d = d[directives.set_loop_options]
     self.assertEqual(d['parallel_iterations'].n, 10)
     self.assertEqual(d['back_prop'].id, 'a')
diff --git a/tensorflow/contrib/autograph/converters/error_handlers_test.py b/tensorflow/contrib/autograph/converters/error_handlers_test.py
index 878526c8b4825080af900a9e838f2d557e8d5273..cd74e5f18f76d0c531f487bc0c736b421c9c3fb4 100644
--- a/tensorflow/contrib/autograph/converters/error_handlers_test.py
+++ b/tensorflow/contrib/autograph/converters/error_handlers_test.py
@@ -34,11 +34,13 @@ class ErrorHandlersTest(converter_testing.TestCase):
       raise ValueError()
 
     node, ctx = self.prepare(test_fn, {})
-    anno.setanno(node.body[0], anno.Basic.ORIGIN,
-                 origin_info.OriginInfo('test_path', None, None, None, None))
+    anno.setanno(node, anno.Basic.ORIGIN,
+                 origin_info.OriginInfo(None, None, None))
     node = error_handlers.transform(node, ctx)
     with self.compiled(node, {}) as result:
       with self.assertRaises(errors.GraphConstructionError):
+        # Here we just assert that the handler works. Its correctness is
+        # verified by errors_test.py.
         result.test_fn()
 
   def test_no_origin_annotation(self):
diff --git a/tensorflow/contrib/autograph/converters/lists_test.py b/tensorflow/contrib/autograph/converters/lists_test.py
index 447a88bbe2bcddfce3c7979e773f96fdd6e75108..996e99ee61b3713a03ff167b892101fca35eaeac 100644
--- a/tensorflow/contrib/autograph/converters/lists_test.py
+++ b/tensorflow/contrib/autograph/converters/lists_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.contrib.autograph.converters import lists
 from tensorflow.contrib.autograph.core import converter_testing
 from tensorflow.contrib.autograph.lang import directives
+from tensorflow.contrib.autograph.lang import special_functions
 from tensorflow.contrib.autograph.pyct import anno
 from tensorflow.contrib.autograph.pyct import parser
 from tensorflow.python.framework import dtypes
@@ -52,20 +53,18 @@ class ListTest(converter_testing.TestCase):
       return [1, 2, 3]
 
     with self.converted(test_fn, lists, {}) as result:
-      with self.test_session() as sess:
-        tl = result.test_fn()
-        r = list_ops.tensor_list_stack(tl, dtypes.int32)
-        self.assertAllEqual(sess.run(r), [1, 2, 3])
+      self.assertAllEqual(result.test_fn(), [1, 2, 3])
 
   def test_list_append(self):
 
     def test_fn():
-      l = [1]
+      l = special_functions.tensor_list([1])
       l.append(2)
       l.append(3)
       return l
 
-    with self.converted(test_fn, lists, {}) as result:
+    ns = {'special_functions': special_functions}
+    with self.converted(test_fn, lists, ns) as result:
       with self.test_session() as sess:
         tl = result.test_fn()
         r = list_ops.tensor_list_stack(tl, dtypes.int32)
@@ -74,12 +73,13 @@ class ListTest(converter_testing.TestCase):
   def test_list_pop(self):
 
     def test_fn():
-      l = [1, 2, 3]
+      l = special_functions.tensor_list([1, 2, 3])
       s = l.pop()
       return s, l
 
-    node, ctx = self.prepare(test_fn, {})
-    def_, = anno.getanno(node.body[0].body[0].targets[0],
+    ns = {'special_functions': special_functions}
+    node, ctx = self.prepare(test_fn, ns)
+    def_, = anno.getanno(node.body[0].targets[0],
                          anno.Static.ORIG_DEFINITIONS)
     def_.directives[directives.set_element_type] = {
         'dtype': parser.parse_expression('tf.int32'),
@@ -87,7 +87,7 @@ class ListTest(converter_testing.TestCase):
     }
     node = lists.transform(node, ctx)
 
-    with self.compiled(node, {}, dtypes.int32) as result:
+    with self.compiled(node, ns, dtypes.int32) as result:
       with self.test_session() as sess:
         ts, tl = result.test_fn()
         r = list_ops.tensor_list_stack(tl, dtypes.int32)
@@ -114,7 +114,7 @@ class ListTest(converter_testing.TestCase):
       return tf.stack(l)
 
     node, ctx = self.prepare(test_fn, {})
-    def_, = anno.getanno(node.body[0].body[0].targets[0],
+    def_, = anno.getanno(node.body[0].targets[0],
                          anno.Static.ORIG_DEFINITIONS)
     def_.directives[directives.set_element_type] = {
         'dtype': parser.parse_expression('tf.int32')
diff --git a/tensorflow/contrib/autograph/converters/side_effect_guards_test.py b/tensorflow/contrib/autograph/converters/side_effect_guards_test.py
index de1874321ec24a6a5d3625e8001df08b8e6edb1b..bee512abbc2e115d69bc9a5d53b6c54d428cc73a 100644
--- a/tensorflow/contrib/autograph/converters/side_effect_guards_test.py
+++ b/tensorflow/contrib/autograph/converters/side_effect_guards_test.py
@@ -43,7 +43,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     node, ctx = self.prepare(test_fn, {})
     node = side_effect_guards.transform(node, ctx)
 
-    self.assertEqual(len(node.body[0].body), 1)
+    self.assertEqual(len(node.body), 1)
 
     with self.compiled(node, {}, state_ops.assign) as result:
       with self.test_session() as sess:
@@ -64,7 +64,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     node, ctx = self.prepare(test_fn, {})
     node = side_effect_guards.transform(node, ctx)
 
-    self.assertEqual(len(node.body[0].body), 1)
+    self.assertEqual(len(node.body), 1)
 
     with self.compiled(node, {}, state_ops.assign) as result:
       with self.test_session() as sess:
@@ -84,7 +84,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     node, ctx = self.prepare(test_fn, {})
     node = side_effect_guards.transform(node, ctx)
 
-    self.assertEqual(len(node.body[0].body), 1)
+    self.assertEqual(len(node.body), 1)
 
     with self.compiled(node, {}, control_flow_ops.Assert) as result:
       with self.test_session() as sess:
@@ -104,7 +104,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     node, ctx = self.prepare(test_fn, {})
     node = side_effect_guards.transform(node, ctx)
 
-    self.assertEqual(len(node.body[0].body), 1)
+    self.assertEqual(len(node.body), 1)
 
     with self.compiled(node, {}, state_ops.assign_add) as result:
       with self.test_session() as sess:
@@ -125,7 +125,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     node, ctx = self.prepare(test_fn, {})
     node = side_effect_guards.transform(node, ctx)
 
-    self.assertEqual(len(node.body[0].body[0].body), 1)
+    self.assertEqual(len(node.body[0].body), 1)
 
     with self.compiled(node, {}, state_ops.assign, ops.name_scope) as result:
       with self.test_session() as sess:
@@ -147,7 +147,7 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     node, ctx = self.prepare(test_fn, {})
     node = side_effect_guards.transform(node, ctx)
 
-    self.assertEqual(len(node.body[0].body), 1)
+    self.assertEqual(len(node.body), 1)
 
     with self.compiled(node, {}, state_ops.assign,
                        state_ops.assign_add) as result:
diff --git a/tensorflow/contrib/autograph/converters/slices.py b/tensorflow/contrib/autograph/converters/slices.py
index 9cfa0666729a2dffd08de78734b9aaecdc6ce576..c527f98613a2ffebf35141d4dac85e972a89c93b 100644
--- a/tensorflow/contrib/autograph/converters/slices.py
+++ b/tensorflow/contrib/autograph/converters/slices.py
@@ -36,12 +36,14 @@ class SliceTransformer(converter.Base):
   def _process_single_assignment(self, target, value):
     if not isinstance(target, gast.Subscript):
       return None
+    if not isinstance(target.slice, gast.Index):
+      return None
 
     template = """
       target = ag__.set_item(target, key, item)
     """
     return templates.replace(
-        template, target=target.value, key=target.slice, item=value)
+        template, target=target.value, key=target.slice.value, item=value)
 
   def visit_Assign(self, node):
     node = self.generic_visit(node)
@@ -76,7 +78,7 @@ class SliceTransformer(converter.Base):
           opts=ag__.GetItemOpts(element_dtype=dtype))
     """
     return templates.replace_as_expression(
-        template, target=node.value, key=node.slice, dtype=dtype)
+        template, target=node.value, key=node.slice.value, dtype=dtype)
 
 
 def transform(node, ctx):
diff --git a/tensorflow/contrib/autograph/converters/slices_test.py b/tensorflow/contrib/autograph/converters/slices_test.py
index 3c0f81e8bc0aa888fca672142e22a661cc813b87..c822d53a4a2810755fd6841af85544dd8fc76a5e 100644
--- a/tensorflow/contrib/autograph/converters/slices_test.py
+++ b/tensorflow/contrib/autograph/converters/slices_test.py
@@ -38,7 +38,7 @@ class SliceTest(converter_testing.TestCase):
       return l[1]
 
     node, ctx = self.prepare(test_fn, {})
-    def_, = anno.getanno(node.body[0].args.args[0], anno.Static.DEFINITIONS)
+    def_, = anno.getanno(node.args.args[0], anno.Static.DEFINITIONS)
     def_.directives[directives.set_element_type] = {
         'dtype': parser.parse_expression('tf.int32')
     }
@@ -59,11 +59,11 @@ class SliceTest(converter_testing.TestCase):
       return l[1]
 
     node, ctx = self.prepare(test_fn, {})
-    def_, = anno.getanno(node.body[0].args.args[0], anno.Static.DEFINITIONS)
+    def_, = anno.getanno(node.args.args[0], anno.Static.DEFINITIONS)
     def_.directives[directives.set_element_type] = {
         'dtype': parser.parse_expression('tf.int32')
     }
-    def_, = anno.getanno(node.body[0].body[0].body[0].targets[0],
+    def_, = anno.getanno(node.body[0].body[0].targets[0],
                          anno.Static.DEFINITIONS)
     def_.directives[directives.set_element_type] = {
         'dtype': parser.parse_expression('tf.float32')
diff --git a/tensorflow/contrib/autograph/core/converter_testing.py b/tensorflow/contrib/autograph/core/converter_testing.py
index 2025e32817c11defa2818618e066eedc92c5db40..5ee2c3fffd7474cb8ca28349385a9d543e92a72d 100644
--- a/tensorflow/contrib/autograph/core/converter_testing.py
+++ b/tensorflow/contrib/autograph/core/converter_testing.py
@@ -94,7 +94,8 @@ class TestCase(test.TestCase):
       return 7
 
     try:
-      result, source = compiler.ast_to_object(node)
+      result, source = compiler.ast_to_object(node, include_source_map=True)
+
       result.tf = self.make_fake_mod('fake_tf', *symbols)
       fake_ag = self.make_fake_mod('fake_ag', converted_call)
       fake_ag.__dict__.update(operators.__dict__)
@@ -144,6 +145,7 @@ class TestCase(test.TestCase):
               recursive=True,
               autograph_decorators=()):
     node, source = parser.parse_entity(test_fn)
+    node = node.body[0]
     if namer is None:
       namer = FakeNamer()
     program_ctx = converter.ProgramContext(
diff --git a/tensorflow/contrib/autograph/core/errors.py b/tensorflow/contrib/autograph/core/errors.py
index e58745337a3faac5e9f351174465443fa52fd6bc..c219b372c13f2870ebde2d35c50dcc1fb270490c 100644
--- a/tensorflow/contrib/autograph/core/errors.py
+++ b/tensorflow/contrib/autograph/core/errors.py
@@ -31,11 +31,14 @@ import logging
 import sys
 import traceback
 
-from tensorflow.contrib.autograph.pyct.origin_info import CodeLocation
+from tensorflow.contrib.autograph.pyct import origin_info
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.util import tf_inspect
 
 
+# TODO(mdan): Add a superclass common to all errors.
+
+
 class GraphConstructionError(Exception):
   """Error for graph construction errors from AutoGraph generated code."""
 
@@ -65,27 +68,35 @@ class TfRuntimeError(Exception):
     return message + ''.join(traceback.format_list(self.custom_traceback))
 
 
-def _rewrite_frame(source_map, cleaned_traceback, stack_frame_indices):
-  """Rewrites the stack frames at the given indices using the given source map.
+def _rewrite_tb(source_map, tb, filter_function_name=None):
+  """Rewrites code references in a traceback.
 
   Args:
-    source_map: Dict[CodeLocation, OriginInfo], a mapping between the user and
-        AG generated code.
-    cleaned_traceback: List[Tuple[text, text, text, text]], the current
-        traceback.
-    stack_frame_indices: Iterable[Int], frame indices to possibly rewrite if
-        there are matching source mapping keys.
-
+    source_map: Dict[origin_info.LineLocation, origin_info.OriginInfo], mapping
+        locations to their origin
+    tb: List[Tuple[Text, Text, Text, Text]], consistent with
+        traceback.extract_tb
+    filter_function_name: Optional[Text], allows restricting restricts the
+        frames to rewrite to a particular function name
   Returns:
-    None
+    List[Tuple[Text, Text, Text, Text]], the rewritten traceback
   """
-  for frame_index in stack_frame_indices:
-    # (file_path, line number, function name, code)
-    file_path, line_number, _, _ = cleaned_traceback[frame_index]
-    source_map_key = CodeLocation(file_path=file_path, line_number=line_number)
-    found_mapping = source_map_key in source_map
-    if found_mapping:
-      cleaned_traceback[frame_index] = source_map[source_map_key].as_frame()
+  new_tb = []
+  for frame in tb:
+    filename, lineno, function_name, _ = frame
+    loc = origin_info.LineLocation(filename, lineno)
+    origin = source_map.get(loc)
+    # TODO(mdan): We shouldn't need the function name at all.
+    # filename + lineno should be sufficient, even if there are multiple source
+    # maps.
+    if origin is not None:
+      if filter_function_name == function_name or filter_function_name is None:
+        new_tb.append(origin.as_frame())
+      else:
+        new_tb.append(frame)
+    else:
+      new_tb.append(frame)
+  return new_tb
 
 
 # TODO(znado): Make more robust to name changes in the rewriting logic.
@@ -98,18 +109,20 @@ def _remove_rewrite_frames(tb):
   return cleaned_tb
 
 
+# TODO(mdan): rename to raise_*
 def rewrite_graph_construction_error(source_map):
   """Rewrites errors raised by non-AG APIs inside AG generated code.
 
-  Meant to be called from the try/except block inside each AutoGraph generated
-  function.  Only rewrites the traceback frames corresponding to the function
-  that this is called from.  When we raise a GraphConstructionError at the end
-  it is then caught by calling functions, where they can be responsible for
-  rewriting their own frames.
+  This is called from the except handler inside an AutoGraph generated function
+  (that is, during exception handling). Only rewrites the frames corresponding
+  to the function that this is called from, so each function is responsible
+  to call this to have its own frames rewritten.
+
+  This function always raises an error.
 
   Args:
-    source_map: Dict[CodeLocation, OriginInfo], a mapping between the user and
-        AG generated code.
+    source_map: Dict[origin_info.Location, origin_info.OriginInfo], the source
+        map belonging to the calling function
 
   Raises:
     GraphConstructionError: The rewritten underlying error.
@@ -120,31 +133,19 @@ def rewrite_graph_construction_error(source_map):
   assert original_error is not None
   try:
     _, _, _, func_name, _, _ = tf_inspect.stack()[1]
-    # The latest function call is added to the beginning of a traceback, but
-    # when rewriting the traceback of multiple function calls the previous
-    # functions' except blocks may have already rewritten their own frames so
-    # we want to copy over all of the previous frames. We may have rewritten
-    # previous frames only if the error is a GraphConstructionError.
     if isinstance(original_error, GraphConstructionError):
+      # TODO(mdan): This is incomplete.
+      # The error might have bubbled through a non-converted function.
       cleaned_traceback = traceback.extract_tb(e_traceback)
       previous_traceback = original_error.custom_traceback
       cleaned_traceback = [cleaned_traceback[0]] + previous_traceback
     else:
       cleaned_traceback = traceback.extract_tb(e_traceback)
-    cleaned_traceback = _remove_rewrite_frames(cleaned_traceback)
-
-    current_frame_indices = []
-    # This code is meant to be called from the try/except block that wraps a
-    # function body.  Here we look for all frames that came from the function
-    # that this wraps, look for any matching line numbers in the source
-    # mapping, and then rewrite them if matches are found.
-    for fi, frame in enumerate(cleaned_traceback):
-      _, _, frame_func_name, _ = frame
-      if frame_func_name == func_name:
-        current_frame_indices.append(fi)
-        break
-    if current_frame_indices:
-      _rewrite_frame(source_map, cleaned_traceback, current_frame_indices)
+
+    # Remove the frame corresponding to this function call.
+    cleaned_traceback = cleaned_traceback[1:]
+
+    cleaned_traceback = _rewrite_tb(source_map, cleaned_traceback, func_name)
 
     if isinstance(original_error, GraphConstructionError):
       original_error.custom_traceback = cleaned_traceback
@@ -153,6 +154,7 @@ def rewrite_graph_construction_error(source_map):
       new_error = GraphConstructionError(original_error, cleaned_traceback)
   except Exception:
     logging.exception('Error while rewriting AutoGraph error:')
+    # TODO(mdan): Should reraise here, removing the top frame as well.
     raise original_error
   else:
     raise new_error
@@ -161,18 +163,17 @@ def rewrite_graph_construction_error(source_map):
     del e_traceback
 
 
+# TODO(mdan): This should be consistent with rewrite_graph_construction_error
+# Both should either raise or return.
 def rewrite_tf_runtime_error(error, source_map):
   """Rewrites TensorFlow runtime errors raised by ops created in AG code.
 
   Args:
-    error: error_impl.OpError, an TensorFlow error that will have its traceback
-        rewritten.
-    source_map: Dict[CodeLocation, OriginInfo], a mapping between the user and
-        AG generated code.
+    error: tf.OpError
+    source_map: Dict[origin_info.LineLocation, origin_info.OriginInfo]
 
   Returns:
-    A TfRuntimeError with a traceback rewritten according to the given
-    source mapping.
+    TfRuntimeError, the rewritten underlying error.
   """
   # Check for cases where we leave a user method and re-enter it in the
   # traceback.  This is done by looking at the function names when the
@@ -198,15 +199,16 @@ def rewrite_tf_runtime_error(error, source_map):
   # The source map keys are (file_path, line_number) so get the set of all user
   # file_paths.
   try:
-    all_user_files = set(k.file_path for k in source_map)
+    all_user_files = set(loc.filename for loc in source_map)
     cleaned_traceback = []
     last_user_frame_index = None
     last_user_user_file_path = None
     last_user_user_fn_name = None
+    # TODO(mdan): Simplify this logic.
     for fi, frame in enumerate(error.op.traceback):
-      frame_file_path, frame_line_number, _, _ = frame
-      src_map_key = CodeLocation(
-          file_path=frame_file_path, line_number=frame_line_number)
+      frame_file_path, lineno, _, _ = frame
+      lineno -= 1  # Frame line numbers are 1-based.
+      src_map_key = origin_info.LineLocation(frame_file_path, lineno)
       if frame_file_path in all_user_files:
         if src_map_key in source_map:
           original_fn_name = source_map[src_map_key].function_name
@@ -223,8 +225,8 @@ def rewrite_tf_runtime_error(error, source_map):
         last_user_user_file_path = frame_file_path
       cleaned_traceback.append(frame)
 
-    for fi in range(len(cleaned_traceback)):
-      _rewrite_frame(source_map, cleaned_traceback, [fi])
+    cleaned_traceback = _rewrite_tb(source_map, cleaned_traceback)
+
     op_name = error.op.name
     op_message = error.message
     rewritten_error = TfRuntimeError(op_name, op_message, cleaned_traceback)
@@ -263,7 +265,7 @@ def improved_errors(converted_function):
     ValueError: If converted_function is not generated by AutoGraph
   """
   if (getattr(converted_function, 'ag_source_map', None) is None or
-      not converted_function.ag_source_map):
+      not isinstance(converted_function.ag_source_map, dict)):
     raise ValueError(
         'converted_function must be the result of an autograph.to_graph call')
   try:
diff --git a/tensorflow/contrib/autograph/core/errors_test.py b/tensorflow/contrib/autograph/core/errors_test.py
index 7be54563a1a86a56437f4da2941bf5187ce813a9..c0e2c74e47ddfb8ee812d6d839b06784e7a01dba 100644
--- a/tensorflow/contrib/autograph/core/errors_test.py
+++ b/tensorflow/contrib/autograph/core/errors_test.py
@@ -28,88 +28,76 @@ from tensorflow.python.util import tf_inspect
 
 
 def zero_div():
-  return array_ops.constant(10, dtype=dtypes.int32) // 0
+  x = array_ops.constant(10, dtype=dtypes.int32)
+  return x // 0
 
 
 def zero_div_caller():
-  a = zero_div() + 2
-  return a
+  return zero_div()
 
 
 class RuntimeErrorsTest(test.TestCase):
 
-  def setUp(self):
-    self._fake_origin = origin_info.OriginInfo('new file', 'new func', 96, 0,
-                                               'print("hello world!")')
-
-  def test_error_replacement(self):
-    _, zero_div_lineno = tf_inspect.getsourcelines(zero_div)
-    src_map = {
-        errors.CodeLocation(
-            file_path=__file__, line_number=zero_div_lineno + 1):
-            self._fake_origin
-    }
+  def fake_origin(self, function, line_offset):
+    _, lineno = tf_inspect.getsourcelines(function)
+    filename = tf_inspect.getsourcefile(function)
+    lineno += line_offset
+    loc = origin_info.LineLocation(filename, lineno)
+    origin = origin_info.OriginInfo(loc, 'test_function_name', 'test_code')
+    return loc, origin
+
+  def test_improved_errors_basic(self):
+    loc, origin = self.fake_origin(zero_div, 2)
+    zero_div_caller.ag_source_map = {loc: origin}
+
+    ops = zero_div_caller()
     with self.assertRaises(errors.TfRuntimeError) as cm:
-      z = zero_div_caller()
-      zero_div_caller.ag_source_map = src_map
       with errors.improved_errors(zero_div_caller):
         with self.test_session() as sess:
-          sess.run(z)
-    expected = cm.exception
-    current_traceback = expected.custom_traceback
-    for frame in current_traceback:
-      self.assertNotEqual('zero_div', frame[2])
-    self.assertTrue(
-        any(self._fake_origin.as_frame() == frame
-            for frame in current_traceback))
-
-  def test_error_not_found(self):
-    src_map = {
-        errors.CodeLocation(file_path=__file__, line_number=-1):
-            self._fake_origin
-    }
+          sess.run(ops)
+
+    for frame in cm.exception.custom_traceback:
+      _, _, function_name, _ = frame
+      self.assertNotEqual('zero_div', function_name)
+    self.assertIn(origin.as_frame(), set(cm.exception.custom_traceback))
+
+  def test_improved_errors_no_matching_lineno(self):
+    loc, origin = self.fake_origin(zero_div, -1)
+    zero_div_caller.ag_source_map = {loc: origin}
+
+    ops = zero_div_caller()
     with self.assertRaises(errors.TfRuntimeError) as cm:
-      z = zero_div_caller()
-      zero_div_caller.ag_source_map = src_map
       with errors.improved_errors(zero_div_caller):
         with self.test_session() as sess:
-          sess.run(z)
-    expected = cm.exception
-    current_traceback = expected.custom_traceback
-    self.assertTrue(any('zero_div' in frame[2] for frame in current_traceback))
-    for frame in current_traceback:
-      self.assertNotEqual(frame, self._fake_origin.as_frame())
-
-  def test_rewriting_error(self):
-    _, zero_div_lineno = tf_inspect.getsourcelines(zero_div)
-    src_map = {
-        errors.CodeLocation(
-            file_path=__file__, line_number=zero_div_lineno + 1):
-            None
-    }
-    with self.assertRaisesRegexp(tf_errors.InvalidArgumentError,
-                                 'Integer division by zero'):
-      z = zero_div_caller()
-      zero_div_caller.ag_source_map = src_map
+          sess.run(ops)
+
+    all_function_names = set()
+    for frame in cm.exception.custom_traceback:
+      _, _, function_name, _ = frame
+      all_function_names.add(function_name)
+      self.assertNotEqual('test_function_name', function_name)
+    self.assertIn('zero_div', all_function_names)
+
+  def test_improved_errors_failures(self):
+    loc, _ = self.fake_origin(zero_div, 2)
+    zero_div_caller.ag_source_map = {loc: 'bogus object'}
+
+    ops = zero_div_caller()
+    with self.assertRaises(tf_errors.InvalidArgumentError):
       with errors.improved_errors(zero_div_caller):
         with self.test_session() as sess:
-          sess.run(z)
+          sess.run(ops)
 
-  def test_no_ag_source_map(self):
+  def test_improved_errors_validation(self):
     with self.assertRaisesRegexp(
         ValueError,
         'converted_function must be the result of an autograph.to_graph call'):
-      with errors.improved_errors(None):
-        pass
-
-  def test_bad_ag_source_map(self):
+      errors.improved_errors(zero_div).__enter__()
     with self.assertRaisesRegexp(
         ValueError,
         'converted_function must be the result of an autograph.to_graph call'):
-      src_map = None
-      zero_div_caller.ag_source_map = src_map
-      with errors.improved_errors(None):
-        pass
+      zero_div_caller.ag_source_map = 'not a dict'
+      errors.improved_errors(zero_div_caller).__enter__()
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/autograph/examples/integration_tests/BUILD b/tensorflow/contrib/autograph/examples/integration_tests/BUILD
index 1368ce244c26a95d1ee7cc4708891badc7981db7..d20c17b63b923458952dbfdb1e07e808cf6a36ff 100644
--- a/tensorflow/contrib/autograph/examples/integration_tests/BUILD
+++ b/tensorflow/contrib/autograph/examples/integration_tests/BUILD
@@ -22,7 +22,19 @@ py_test(
         "keras_test.py",
     ],
     srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
+    tags = ["no_windows"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_test(
+    name = "list_literals_test",
+    srcs = [
+        "list_literals_test.py",
+    ],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         "//tensorflow:tensorflow_py",
     ],
diff --git a/tensorflow/contrib/autograph/examples/integration_tests/keras_test.py b/tensorflow/contrib/autograph/examples/integration_tests/keras_test.py
index a2fc7c550efe829a104dc3931f29cc4f8fcf60d4..73125eb452fc3f3f94a8323d677341345931c4ea 100644
--- a/tensorflow/contrib/autograph/examples/integration_tests/keras_test.py
+++ b/tensorflow/contrib/autograph/examples/integration_tests/keras_test.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 import tensorflow as tf
 
+from tensorflow.contrib import autograph
+
 
 class MinimalKeras(tf.keras.Model):
 
@@ -27,11 +29,34 @@ class MinimalKeras(tf.keras.Model):
     return x * 3
 
 
+class ModelWithStaticConditional(object):
+
+  def __init__(self, initial):
+    self.initial = initial
+    if self.initial:
+      self.h = 15
+
+  @autograph.convert()
+  def call(self):
+    x = 10
+    if self.initial:
+      x += self.h
+    return x
+
+
 class KerasTest(tf.test.TestCase):
 
   def test_basic(self):
     MinimalKeras()
 
+  def test_conditional_attributes_False(self):
+    model = ModelWithStaticConditional(False)
+    self.assertEqual(model.call(), 10)
+
+  def test_conditional_attributes_True(self):
+    model = ModelWithStaticConditional(True)
+    self.assertEqual(model.call(), 25)
+
 
 if __name__ == '__main__':
   tf.test.main()
diff --git a/tensorflow/contrib/autograph/examples/integration_tests/list_literals_test.py b/tensorflow/contrib/autograph/examples/integration_tests/list_literals_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..680b6dbaf07fc10e11dfa1e9d3a075624024c103
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/integration_tests/list_literals_test.py
@@ -0,0 +1,41 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests of functions that use list literals."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from tensorflow.contrib import autograph as ag
+
+
+def list_used_as_tuple():
+  return tf.constant([1, 2, 3])
+
+
+class ListLiteralsTest(tf.test.TestCase):
+
+  def test_basic(self):
+    converted = ag.to_graph(list_used_as_tuple)
+    result = converted()
+
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(result), [1, 2, 3])
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/contrib/autograph/examples/notebooks/algorithms.ipynb b/tensorflow/contrib/autograph/examples/notebooks/algorithms.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..bf824e2760e694ae3c00c9f08d9aa5d5522a9b84
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/notebooks/algorithms.ipynb
@@ -0,0 +1,1512 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "b9R-4ezU3NH0"
+      },
+      "source": [
+        "## AutoGraph: examples of simple algorithms\n",
+        "\n",
+        "This notebook shows how you can use AutoGraph to compile simple algorithms and run them in TensorFlow.\n",
+        "\n",
+        "It requires the nightly build of TensorFlow, which is installed below."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "TuWj26KWz1fZ"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install -U -q tf-nightly"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "3kudk1elq0Gh"
+      },
+      "source": [
+        "### Fibonacci numbers\n",
+        "\n",
+        "https://en.wikipedia.org/wiki/Fibonacci_number"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 197
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 7512,
+          "status": "ok",
+          "timestamp": 1532101577266,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 240
+        },
+        "id": "H7olFlMXqrHe",
+        "outputId": "472dbfe0-9449-4f93-e908-1a0785188a92"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "0 :  1\n",
+            "1 :  2\n",
+            "2 :  3\n",
+            "3 :  5\n",
+            "4 :  8\n",
+            "5 :  13\n",
+            "6 :  21\n",
+            "7 :  34\n",
+            "8 :  55\n",
+            "9 :  89\n"
+          ]
+        }
+      ],
+      "source": [
+        "import tensorflow as tf\n",
+        "from tensorflow.contrib import autograph as ag\n",
+        "\n",
+        "\n",
+        "def fib(n):\n",
+        "  f1 = 0\n",
+        "  f2 = 1\n",
+        "  for i in range(n):\n",
+        "    tmp = f2\n",
+        "    f2 = f2 + f1\n",
+        "    f1 = tmp\n",
+        "    print(i, ': ', f2)\n",
+        "  return f2\n",
+        "\n",
+        "\n",
+        "with tf.Graph().as_default():\n",
+        "  final_fib = ag.to_graph(fib)(tf.constant(10))\n",
+        "  with tf.Session() as sess:\n",
+        "    sess.run(final_fib)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "p8zZyj-tq4K3"
+      },
+      "source": [
+        "#### Generated code"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 541
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 103,
+          "status": "ok",
+          "timestamp": 1532101577412,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 240
+        },
+        "id": "UeWjK8rHq6Cj",
+        "outputId": "73ece895-12fb-489a-e52c-032945d7ed7a"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "from __future__ import print_function\n",
+            "import tensorflow as tf\n",
+            "\n",
+            "def tf__fib(n):\n",
+            "  try:\n",
+            "    with tf.name_scope('fib'):\n",
+            "      f1 = 0\n",
+            "      f2 = 1\n",
+            "\n",
+            "      def extra_test(f1_1, f2_1):\n",
+            "        with tf.name_scope('extra_test'):\n",
+            "          return True\n",
+            "\n",
+            "      def loop_body(i, f1_1, f2_1):\n",
+            "        with tf.name_scope('loop_body'):\n",
+            "          tmp = f2_1\n",
+            "          f2_1 = f2_1 + f1_1\n",
+            "          f1_1 = tmp\n",
+            "          with ag__.utils.control_dependency_on_returns(ag__.utils.\n",
+            "              dynamic_print(i, ': ', f2_1)):\n",
+            "            f2, i_1 = ag__.utils.alias_tensors(f2_1, i)\n",
+            "            return f1_1, f2\n",
+            "      f1, f2 = ag__.for_stmt(ag__.utils.dynamic_builtin(range, n),\n",
+            "          extra_test, loop_body, (f1, f2))\n",
+            "      return f2\n",
+            "  except:\n",
+            "    ag__.rewrite_graph_construction_error(ag_source_map__)\n",
+            "\n"
+          ]
+        }
+      ],
+      "source": [
+        "print(ag.to_code(fib))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "eIfVy6ZTrFEH"
+      },
+      "source": [
+        "### Fizz Buzz\n",
+        "\n",
+        "https://en.wikipedia.org/wiki/Fizz_buzz"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 125
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 233,
+          "status": "ok",
+          "timestamp": 1532101577681,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 240
+        },
+        "id": "33CAheYsrEQ7",
+        "outputId": "82a493ee-15b5-419d-8c9c-5f4159090a05"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Buzz\n",
+            "11\n",
+            "Fizz\n",
+            "13\n",
+            "14\n",
+            "FizzBuzz\n"
+          ]
+        }
+      ],
+      "source": [
+        "import tensorflow as tf\n",
+        "from tensorflow.contrib import autograph as ag\n",
+        "\n",
+        "def fizzbuzz(i, n):\n",
+        "  while i \u003c n:\n",
+        "    msg = ''\n",
+        "    if i % 3 == 0:\n",
+        "      msg += 'Fizz'\n",
+        "    if i % 5 == 0:\n",
+        "      msg += 'Buzz'\n",
+        "    if msg == '':\n",
+        "      msg = tf.as_string(i)\n",
+        "    print(msg)\n",
+        "    i += 1\n",
+        "  return i\n",
+        "\n",
+        "with tf.Graph().as_default():\n",
+        "  final_i = ag.to_graph(fizzbuzz)(tf.constant(10), tf.constant(16))\n",
+        "  with tf.Session() as sess:\n",
+        "    sess.run(final_i)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "Lkq3DBGOv3fA"
+      },
+      "source": [
+        "#### Generated code"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 1081
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 289,
+          "status": "ok",
+          "timestamp": 1532101578003,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 240
+        },
+        "id": "bBhFIIaZrxvx",
+        "outputId": "d076a7ea-e643-4689-f90a-57f5d086dedc"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "from __future__ import print_function\n",
+            "import tensorflow as tf\n",
+            "\n",
+            "def tf__fizzbuzz(i, n):\n",
+            "  try:\n",
+            "    with tf.name_scope('fizzbuzz'):\n",
+            "\n",
+            "      def loop_test(i_1):\n",
+            "        with tf.name_scope('loop_test'):\n",
+            "          return tf.less(i_1, n)\n",
+            "\n",
+            "      def loop_body(i_1):\n",
+            "        with tf.name_scope('loop_body'):\n",
+            "          msg = ''\n",
+            "\n",
+            "          def if_true():\n",
+            "            with tf.name_scope('if_true'):\n",
+            "              msg_1, = msg,\n",
+            "              msg_1 += 'Fizz'\n",
+            "              return msg_1,\n",
+            "\n",
+            "          def if_false():\n",
+            "            with tf.name_scope('if_false'):\n",
+            "              return msg,\n",
+            "          msg = ag__.utils.run_cond(tf.equal(i_1 % 3, 0), if_true, if_false)\n",
+            "\n",
+            "          def if_true_1():\n",
+            "            with tf.name_scope('if_true_1'):\n",
+            "              msg_2, = msg,\n",
+            "              msg_2 += 'Buzz'\n",
+            "              return msg_2,\n",
+            "\n",
+            "          def if_false_1():\n",
+            "            with tf.name_scope('if_false_1'):\n",
+            "              return msg,\n",
+            "          msg = ag__.utils.run_cond(tf.equal(i_1 % 5, 0), if_true_1, if_false_1\n",
+            "              )\n",
+            "\n",
+            "          def if_true_2():\n",
+            "            with tf.name_scope('if_true_2'):\n",
+            "              msg_3, = msg,\n",
+            "              msg_3 = tf.as_string(i_1)\n",
+            "              return msg_3,\n",
+            "\n",
+            "          def if_false_2():\n",
+            "            with tf.name_scope('if_false_2'):\n",
+            "              return msg,\n",
+            "          msg = ag__.utils.run_cond(tf.equal(msg, ''), if_true_2, if_false_2)\n",
+            "          with ag__.utils.control_dependency_on_returns(ag__.utils.\n",
+            "              dynamic_print(msg)):\n",
+            "            msg_4 = ag__.utils.alias_tensors(msg)\n",
+            "            i_1 += 1\n",
+            "            return i_1,\n",
+            "      i = ag__.while_stmt(loop_test, loop_body, (i,), (tf, n, ag__, i))\n",
+            "      return i\n",
+            "  except:\n",
+            "    ag__.rewrite_graph_construction_error(ag_source_map__)\n",
+            "\n"
+          ]
+        }
+      ],
+      "source": [
+        "print(ag.to_code(fizzbuzz))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "BNRtprSvwJgk"
+      },
+      "source": [
+        "### Conway's Game of Life\n",
+        "\n",
+        "https://en.wikipedia.org/wiki/Conway%27s_Game_of_Life"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "r8_0ioEuAI-a"
+      },
+      "source": [
+        "#### Testing boilerplate"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "7moIlf8VABkl"
+      },
+      "outputs": [],
+      "source": [
+        "NUM_STEPS = 1"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "QlEvfIQPAYF5"
+      },
+      "source": [
+        "#### Game of Life for AutoGraph"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "5pCK2qQSAAK4"
+      },
+      "outputs": [],
+      "source": [
+        "#@test {\"skip\": true} \n",
+        "NUM_STEPS = 100"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 8,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 308
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 14892,
+          "status": "ok",
+          "timestamp": 1532101593030,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 240
+        },
+        "id": "hC3qMqryPDHS",
+        "outputId": "8405c0e9-e518-41d6-f5bc-e78df6474169"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/html": [
+              "\u003cvideo width=\"432.0\" height=\"288.0\" controls autoplay loop\u003e\n",
+              "  \u003csource type=\"video/mp4\" src=\"data:video/mp4;base64,AAAAHGZ0eXBNNFYgAAACAGlzb21pc28yYXZjMQAAAAhmcmVlAACZUm1kYXQAAAKuBgX//6rcRem9\n",
+              "5tlIt5Ys2CDZI+7veDI2NCAtIGNvcmUgMTQ4IHIyNzk1IGFhYTlhYTggLSBILjI2NC9NUEVHLTQg\n",
+              "QVZDIGNvZGVjIC0gQ29weWxlZnQgMjAwMy0yMDE3IC0gaHR0cDovL3d3dy52aWRlb2xhbi5vcmcv\n",
+              "eDI2NC5odG1sIC0gb3B0aW9uczogY2FiYWM9MSByZWY9MyBkZWJsb2NrPTE6MDowIGFuYWx5c2U9\n",
+              "MHgzOjB4MTEzIG1lPWhleCBzdWJtZT03IHBzeT0xIHBzeV9yZD0xLjAwOjAuMDAgbWl4ZWRfcmVm\n",
+              "PTEgbWVfcmFuZ2U9MTYgY2hyb21hX21lPTEgdHJlbGxpcz0xIDh4OGRjdD0xIGNxbT0wIGRlYWR6\n",
+              "b25lPTIxLDExIGZhc3RfcHNraXA9MSBjaHJvbWFfcXBfb2Zmc2V0PS0yIHRocmVhZHM9OSBsb29r\n",
+              "YWhlYWRfdGhyZWFkcz0xIHNsaWNlZF90aHJlYWRzPTAgbnI9MCBkZWNpbWF0ZT0xIGludGVybGFj\n",
+              "ZWQ9MCBibHVyYXlfY29tcGF0PTAgY29uc3RyYWluZWRfaW50cmE9MCBiZnJhbWVzPTMgYl9weXJh\n",
+              "bWlkPTIgYl9hZGFwdD0xIGJfYmlhcz0wIGRpcmVjdD0xIHdlaWdodGI9MSBvcGVuX2dvcD0wIHdl\n",
+              "aWdodHA9MiBrZXlpbnQ9MjUwIGtleWludF9taW49MTAgc2NlbmVjdXQ9NDAgaW50cmFfcmVmcmVz\n",
+              "aD0wIHJjX2xvb2thaGVhZD00MCByYz1jcmYgbWJ0cmVlPTEgY3JmPTIzLjAgcWNvbXA9MC42MCBx\n",
+              "cG1pbj0wIHFwbWF4PTY5IHFwc3RlcD00IGlwX3JhdGlvPTEuNDAgYXE9MToxLjAwAIAAAAPQZYiE\n",
+              "ABH//veIHzLLafk613IR560urR9Q7kZxXqS9/iAAAAMAFpyZZ6/h5MpYA5/oqv4s2qPbYpW3jfK6\n",
+              "zQ6q7WMrNj7Hy8jZzmBpfHCwAAO1W4riBNsrapcCk+5V1W0XkkFULR4Qe+H3uGA2HgNW0zFAAUgt\n",
+              "W4tdpXv2OEg0Vuy5W5l/xGRmEGKDyeXyrM0S6q/1EKbad0x2mcHseUqNmeOGLy1N3b376XZKZcPY\n",
+              "IXC5F2332tNMj8CwOQiXM9PiCLyCVfZ3rQSkKBTZErkpS5kXUyoJG3FdIqLjRFKEapbUjcW64HIo\n",
+              "BeIbtRyWV9FyZfcTakx2KW3eB4ZI//MDykSe8CRgN76uBEqZFXwO63wmUREhHOb5AdaLV3xyGl/I\n",
+              "RV70rU/3t9t1aq5mFD3hy1aLTAV2U7nG072dyX87F7NgCxZHT2kFxu44fxf6gqVzE3PEbGr5fx9x\n",
+              "7TKXtmY53VP8UaeCd2HJiZ/sd165SutTnfiWvaLuCnmmXGF0AGqbj9S19kgOhTubZIJBydTTqQOV\n",
+              "YRlxbgKn2nzvunv9+NDG0/2ikyyp73W15QClmjyt8dUeynoN8CwtEQ59DdrAPZe4ARZTwWAfsRXw\n",
+              "1vcZ6Gr1nCNWllQw5IyZyxQtXrfc5p4wjPvGaltciG7d3FG1SGk6HDsZy5i/PsnkjRXLUvGbzYp2\n",
+              "2gs7ZSGfSJbEifctcMGeSqhOOYORKy6f/9omoieCVEEkniBXwWZ/eImb3nxF7SFIaBjgG2j9w5ut\n",
+              "BY6zSuQ5zRCdajzJ1loNO0havI8mp5yViAeAlLKYCxeK0Lha1FskL67W1YsARZVZ5EkhqAYEeTNI\n",
+              "M38Og48OXmj6QBN7c1b9uDUTacYEXO88ZQ1gCIREIMnm2Fgkir8pN4gtSeQ12sfOVz5x5KX7sa95\n",
+              "L4LyFQPDrFZcDBr4PWLeEEv8yzk0cYHE97GmAlA6WQ0HlWsS42cnXefvTPXnx4vcq8pbEo/slAuH\n",
+              "IBsrJEN1+aMCc9FNxwUPVbZVaWVjwLY0qh+mNWEaiNGRmacDXrYWw0NjqMPiLiFHacY5oGELRgym\n",
+              "S2mSo6zhsD1wKQ3EUQtwrjKPiDYc/HCqhkVwoWKUdI8xTS60kn4f5UqB0L77Yevh/wt7AnvQKQAq\n",
+              "QAEEevggRl1uigbOBTtscnYRnAj0edW4QExAzdo+RwLWXTzW/l3cBWTrh3ORzZQlxJ8jQTvPLB+f\n",
+              "bLazJZWFQQDcWhuhQ3gYcP1ruNwIroINRIr8px0UOgAhnk6CllxMN6gA5S0YPhFVFKd3n0AAAC9f\n",
+              "vYgISQAAAltBmiRsQR/+tSqC8p1IAOZemTPutEfx0mzK8zG8tdIxonBsDpoLZ+NnIOp4qK6idP1s\n",
+              "vbGvZz/zHM86Bg3q0yx2atmtgoo/Trt3YRy3se4HTjou+tCi7oJt2d7A8vEhVDu33JNJx+WCOgP0\n",
+              "03nVdg9lBs15v/0w7qMc3zqqJXCOy/Whl9aRhcaeOEWcD7uK6mCV8a6MpDJ959xBRfv2i/qFOFbL\n",
+              "Grs58WiGJcq4MQJI+rVWuFN50oiqBgiunfUrRmdviPYpNN11V9pwcOJwssWfIE3agnor/RC7vfLY\n",
+              "YoXzaJjtWLEL92OOaHLZT0j555xfb4FZcoJee+RXovB9IaoDdYRusngtBXPMUvnO+g2Z5Qdo9P8q\n",
+              "Zb8ItBAeHT8IBZAD/Z2nEA6qbxqOBSBtQNW6ZFYLtCTIoP/bLjCDHgtZk3cf+N1CpXs15pUIYWDW\n",
+              "elZtlTkM4w4EJlLdjLZyQPAeaBx/qoLmKyTKAEhm0hU8EcTq00f6fwkWgz2J6GTGtL/vJXgC8u4o\n",
+              "nTnf+Ou7sVJGVaouXxrzx+yGVHEcp/eV4gaFA95rInngQAOZWbA3558nK61JBPZl3NjEv5B9r9pg\n",
+              "2+SYY3wBAUeu2fgAB2+yYGw82pkoJJKpzYWORs6i1vn3GEgUTcwlYsdJcraYC5SnGvqSZhX7KM72\n",
+              "uE1e9bkpvpVyG/mkACn5R4jwX3xc2utCjjZgM101rirIF/7VfDtmJsSTDes+UVhbSr3SeMSI9ixJ\n",
+              "+fVuFZ5bnQPoRIfPc+Erw+K99JiGN+HE98/eq4pPlMY9oCfVPSdNyOAAAAFfQZ5CeId/AUuqOi5D\n",
+              "jlKfxuJGZZ1+rVyomjOIykvxtsjsuCiGtElbraCSFWcn3aIYWLrF3fPovVLcOnroBkiRMsdf5yJA\n",
+              "F87MQuoKeTaGOrxojCCCS64RiHrqNsE+7mfRRUDuB4sAEHFQHxBorgTukPSvrdFr5QDq+BhZj/6H\n",
+              "KN+IutwFWKX3ZX9pO3sI8My78TgRY5AA6FEcT91WcvnMypB/OWXzK6M8fYuhVVWipAZigjVOYhcF\n",
+              "9i6GweQFX9AV9EUQOp2qFbkrT5jceBRFLX6j4JUQ781/UGTekv1fcpCmzlpNpp8GdSeWxRL4gasp\n",
+              "F5uO5KW63rlhYccBo1cFwIN8txHNnwyQNiP00XC0PWDRZfaWSxsACRWrISow71IyUfcL7JNhjTII\n",
+              "rwDYATS0xZ9ep8siFC3JTxg1eNaroYfeI4tbkRHok47Vk+CUOQPuagVBtFMOOcy2OUbw8AWlAAAA\n",
+              "ugGeYXRDfwHM79ghzBo9nMnzfQPPIuvorxBb6AC8F4fYGD/t93kNSKNSEuhUXq9FKGtxnCkxN880\n",
+              "BPb/uTbjLTQVyPNuYlGl/gTlyLcVA/cDoLrl5TvaR/AcSLFE7C/t3kLx0STNibmdAf4TsHWKSblH\n",
+              "VWB4X7oQHrrDdhwIivRgUZf7f63j2XaGB+cbp5aHCCwJoovY51YTqsZZTz70FlSnypPHQBNzif7h\n",
+              "uvZkXhtEzpu9rYMo3YECkgAAAXIBnmNqQ38BDchAitLfY16mYQAQlVmv7062W8KLpIS1/zhS50Ib\n",
+              "b3ERigmkZKZMPaCsAi+zsLcku/gHGHnVZpuCZMFs72gmyuL4JFo6VjWcr5FtBvzIgD26rBNvP73P\n",
+              "nJjl3JImmFHiKjNez/gG3zTuYyCACuJCEYXyuEmzCM13hdCPHKg5GZtso0Z1qk6T1k2oiqF/3RIn\n",
+              "kyjRWuxBlHHmJ46TXULiUY14G+RAGoXI+u/G6muNclld2bq+6Zztuy+5ynaDWNNjuN1Ag9KUIx2F\n",
+              "XwNdepmp52/rOvISNPbMJ0U26OvqplXi+qHTbg8MLpUSIGCY8w9FZ5woLAENgvgu9M79yGlL20e7\n",
+              "ypJ4RMBqHYDpEz6Z+SSjXD8LsJ7VKlwo22A5Yukp1vTp6HHA35nV+PXK09DuRWKKdQUzmXVihF51\n",
+              "/+bB0PEFdoNxGdbbM7WveaCJN8XI7JgQWvw2nPlHX8M5QyPGSJ2HEexumoFrABvRAAAB70GaaEmo\n",
+              "QWiZTAgj//61KoCPNGHq/MxnjqmxxQAEHvTwibmyMZGX3ES9Abh1tMR+/DjR+6dnqRr/VxCl6gEP\n",
+              "wJ/5EYCYfGaGmQYsLOeM3v2SZjdvqQBwrwKk5A/63kFm8fc3QCLe93Mldv3KWXHdFT7/mudSntDc\n",
+              "vJwStG4jgi5LKlWdSrVaAxOmElsF+zWNzaCIQ1dOiZqi3JKj64hOeq1XIWyGvRvh6OLKBpB4rL6W\n",
+              "ugf7H/IPbSQuF5jWV7zL5LhxWiTiI+kAZTUMfO2YOLzmhCUSN9GAmNzgY4D2awYB4V4QTDjI7kdQ\n",
+              "tL+3Pmfl1HVilu7nC9CzQSvWIosiwv4btyHTL7IPT2gusybyNfW8QO133L6KbDhhXSDWUtcIFCgn\n",
+              "QUm36C9hvgGjorpKYr5VnErpJX6fRJm76fFYs8/nt763alyqdcSrqaTOLaf/72Wkkmlwbq3nLOIw\n",
+              "ADFDkkAPwzaM811K11iK/3HaYRT3nEhjJQFk5v4WBXwIVLAZeKdtC8YoGN9K6isN142fOG3s6fm4\n",
+              "J1nMtOEZHIwep8In4slLmHh39qBzhGZO3igiVpgz7u+JMBeFkVHe72vduBjIy+1dqvxL/TPics3s\n",
+              "+alwfTMNQKave1qW+5Uj8jZQTjcLAtKvzoako9VMIOfQUQAAAQpBnoZFESw7/wC9ZU4P+UeGsidW\n",
+              "4n5tFkXmtxppYvKQ+WGj/x3AAdl6+9c9x7N2b/yJykTvVggfpMnFUWtxla4sr1ouwANom+Uf4IBJ\n",
+              "/zXPovndpGdy98nJbZxFU4rrWpr8aI4YmRX65+IGTn756CZWwXKY5DyMgKnDcCtk0HEuoHgdGhh7\n",
+              "1PG8+nue+pE9pBHqiBNWAjPd90qfMtABmMShLoXtUObqYbqXhJvVjjFhKdPS03IF24fu9Z0ax15V\n",
+              "DnkiLmgyOCvJmcdIX70L2ZEECd/hxrSq9JUVjC41OX0F/ayI6GtkPMUuZ2xWkMFo5rqOAo7v0Zlk\n",
+              "ke/79TjeY13FNiowqcbhMwfDuwAAATIBnqV0Q38BDXNpg2t4nJdhAA5ru/5Co2KbB/AnQt7fa959\n",
+              "0crOQgtTxL36jtVyKPmfuQMYuWbJ/7bYTEV8sEjceHvN6B0CSEZzVCjaPLzOQJZMQpQ4K4WKPlGc\n",
+              "lnEwYAC9Dsejj7Fbk2RyCFiJinyU2HOscjUR6fW2jRsAFpVq/PtZDVPvesPG3AqooVaKHp9Ex+Da\n",
+              "AH0OvccSugyDKsRBAEiYR8645aXxbFSzraQsELDsIIr6HRN8F3lUNVBvzNO3mxBhq4th/kgZSjjJ\n",
+              "JZrYmg3UfIUO/jn4xs2XQ9Pa7Uy5K3JhuIQwAOUKDmAMC0p6fgz2on4ceyEcfiCGDPZpPyL3391F\n",
+              "dXID0ctPQ1a+Hk7UcAc9gSDL8CZKz59YyO0ACPjfAKV3Y2dbTAKdWBsUU0EAAAFEAZ6nakN/AItk\n",
+              "aaqbMCcBE0iEIDnEBfRZN0neHQxaz5DPSzK0ZSL640q0AA5jkP0YAYAumNCN0MxJYpWFoQ9r43H0\n",
+              "i9SZLdv1UbgpG3aX6KESZW7AgdlevaBngH/w8xYsqWx5t90zzi7x9VyRYpIAD+XTrxvgBoFILNCs\n",
+              "gd+zDA9uvbAPlLMwG/qFltlwvLokMt344erv3a/C/ySOwZHFzpakInpJ7MQHkmKi1KHZB5KrfqwF\n",
+              "FnglZJwWbe7LtVojTdwQnAksziDNlEWCkMQQJwziY1KYtlXMNX8mZ3MtYR1KNf/CNin7/ys9ZQyx\n",
+              "4Zlk//H5KDc/8O2+JaxH20CAaAABxgSxo+yJal1LnRHYfOQ1TygNueW/rPAA37g/6fLS7mbYKz7k\n",
+              "dsiSiy1mAV7n/qq81UHJPShQSXK+E4Y5XKuXEWG4AAAB8UGarEmoQWyZTAgj//61KoAW7kO9JCjl\n",
+              "XSE6nAngAJVxWWFl/YDS0gZ32xjwUFed4hmI6rj18z16nS3Mz1iMmFblrtaE4zGXS046COODiIwH\n",
+              "QG5lRmcBExMKlnynQruQtA8n/NitzdP/ysLrucGyp5nKV+XyJURULfxk4kwNp0a5TFlJ1fusOOJm\n",
+              "y0hvsvEg+d4Jz3anvWT6M9n5A84CGucNifV+WlN9gI9gs3qSoCZdU/gglcFYM5u8YchzhQFyMKxn\n",
+              "kpfWK2LU7aaZHt6xLbqjuv74523K9/dtrrsFq/LySiv1P9Wk6/6d5RC72z4cyaUq6hMMn4IWWRo0\n",
+              "zJIM1/lSYsWxt5/M1Mkv00Rt8OZvmLxuFfd1BIVlANlpgZ39RYhqqzU6v1HwaW0EudelFBGhr5mf\n",
+              "GaDE05Z8ywp5rN4Qq4D4GNAGD/qgEjtaDDf4ZBAD/TAHBwxfNjm2nPAdbbbIuWSkkv8NK6EMlKqH\n",
+              "mOktd+CB3P6Szd1+HPnUsyQ3659r3XLnoi0cvM4usfW+BgxqT0mgHSgn/F6ajdTNM+a8xJQnT036\n",
+              "7195r0uF5vwi7PIviCQ2E4Vs4Wx80/8tBDEJS4qOY1YJ5aNV1OV82fB3HOimLHd2vU/d4Cv7OBh8\n",
+              "k3gNFcjeBGh+3lQcDCLZrG1mAAAA3kGeykUVLDv/AGVBMHxAlJYGEpFnv2bb0ADrwvVKxe7+SIJI\n",
+              "g0dPJdL0s9Hd2mGX7rpdIiUH9ZgtnBO+m3uPNae/YtN3u2p0kkCez2KiPNqgSoEcHM+ePgq7afkq\n",
+              "0HHTSZl/+QbjsyfbI/0lv1mLAJUd3u7VZPPHSdXK3vwLfAwOe3Nid72slU892DijWVvanzM1IzDQ\n",
+              "XfN6x6GH2qfaLrHePrJTJxXC/RSxcAol7x2JJ5OA8VjN8jXu0yKirBiYqgcdFf9odG8j4bRmE2wD\n",
+              "MG0SKuGrJfd91b6B7hbRUwAAAPYBnul0Q38Ahz7YAbwPIqnkAA5sEIcKo2/sVUP0LEeFOLjKjaet\n",
+              "5YFAjDbL5BIdGqWouG/H8ozoec2ZpUbIZu0ELtG5yXc/5opSZlnqbOpqdTQkLs6gr9dv5GbFvVjS\n",
+              "Os1j9FIMQsdc8pttosNtygWB8gLxr65El6umAZE5CVU9Mc8Xxg/tenmTduGK9Cd7qRDiu1sLYR2f\n",
+              "or3KBMo8ebz5q5EmWucvREbYSziQIIycIwJg9OG+aH+ZUEQbjbfHfaiX7yoxGJGP78aNOHP7GvC+\n",
+              "JwM6DxnSyowUBAqkW8ckgrhet8gYYrt8MIe1MPJQB6sv8hHuAXkAAAFWAZ7rakN/AI9XvmYGr0rf\n",
+              "QEvrPPTQWEAA5ru3wBCXPJiC8OaE25OBvVl2wRXqp61wQU4HxGJCAxkSOz+G3Yzvg36uCK8bPZTq\n",
+              "avaOG/H9WxjsuwAl/bIYJdnyD151CiUZ34aErVIixKJ53oKrLeHr3xLgxuH+y3w5uH5lQRsL0Pmp\n",
+              "0jQItTBkKwlPywxFk55pROuYZWi/h/N19QaFlF7WPobUElLlr+nCH+pVt1nW9/YwVGz/cO8zwmWe\n",
+              "Fb0OnFji7CYSsi9ScC3a50GjUP7IpaY5NAHv33V57bkO/BD6dnreymTbSmQdcj7PAJkvz610fMqn\n",
+              "mDGTMB31oxAIE5eWeH7mBZouSgmtxEamul7sYaTPe7mP6FqNCz0h6wLot/zAFwx9/D2+XB0x8mmS\n",
+              "b086o+gqkoYoHQeQm2Sb3MU1Bz0KHDGo9jCmsBmecxs3oNHV4KaIoLKAAAABrEGa8EmoQWyZTAgj\n",
+              "//61KoAcdmk2P6doyaR4wEHxsIcmssCD5f+3/v8PGtlbWZ+A0oGGFPTAdgmU2TFbrRxlmwUCouNe\n",
+              "8freV7blHDodFImzwP3saA3AZT6NUl7vDGH/tw5n9y8rP4XGnhEXBHK+6jIhoAYc6G1CDX0mqczJ\n",
+              "7tbei5I0YSkDjza4rJSbAF6cRoJQH3s2Q+ggBQR0BfH6N3QlPVwd9YFvP6++J+XrbNU56Pxu6Wey\n",
+              "51asar4AaARXHregTXL4xn/VNt8Ppk2xD3/1jXAVXdqMlS0tYGM/TtrcuTC63Lx21RQtklG6k0xA\n",
+              "eWm6W0oL0KTvxuyegpC2ySp5v6zpSEYvzWR4IYirfT0RYU+jLtX0t4M/L/0k8xOLTHbouoUPD6DN\n",
+              "dYYLYlVX5noJzjCAVCiS21OCcIKqWD/YiU/+dTZpdFFNdHEa/MPvUEq7cJD7ANJ0YUweepq2Eqdh\n",
+              "57SC4Tpg6jyEnFgMaHQLSz1nJNh4lxM1TPouGZ9bmQdDr9WY+nwzRBa+ZLnaqBSYKWSKEs/TNtNZ\n",
+              "ev7d+EnJUf9G9CAmmiSDlRAvAAAAz0GfDkUVLDv/AGU2nAwHHyQlvUxuENDSO8vXFIAPilnMlQWb\n",
+              "nTHwb8wkIo6JKOaIP9blrrNXcWeeQDVprB1Bn//+nbSDHls1apJcUyMHUmojA58P91gutTiF40zp\n",
+              "fDaF096G01gcvpH5Za4+DfUvxQpt/wH5PntJzggww1tLhP1NyH5U2TTgrnA/BevK2aCa9xCuCVgA\n",
+              "JJZF4uqHE//COeWbJ6LIFJPoadxAxbrAcxPQQHMzEG5G5S3Yfd+YJBLrdO35JvVrsUTYO4AfvJeC\n",
+              "zwAAAe8Bny10Q38Aj03WPPyvISnWAC7KM5WfLH925SBeAKcvJaYOa5WZCzX9H5nU/7qAFTCgAnl3\n",
+              "rAoSnKk1337XDAnLfPYAAOSIcqQwF++e4HouwNVAWCEsVyl7Y6DnBaBT2mD1H8560KoMvm3kKNNC\n",
+              "oxFCc4BdAIXk45JUbGFNGYAjCbBbJInMjwa41HA404yKnJG7rNXdBctnsSL/36UoXvVx3J2tGX84\n",
+              "+FHk7e72CsAyB49ajd62idmFQji9Jj1GaiqtCIjWs5o6Mz8s5QfrvipNYYD0YZ7gBBGm4AEz17d8\n",
+              "isscgsp4QI2odbuEJDq1nfJbW6+1HGcN1XfDC1Xfa5IptM5UYHm5zIT4rSPBIDE6l8/NhVxlFP21\n",
+              "JPQ0DZxnZFvxIBznQbqkhaGZjMafgFoRzC9Nl17x+K6e75RlplRZtXaUIbjAUFBJIQPkoIrT6/O9\n",
+              "NtkAmnl8qqUC1RktW/RjiJqOyRTTITHqNKvKy/0gb88xEvvGPgzcSs2KpkbHJWmCGIlSWEkuqcCE\n",
+              "jBn3Y8XOQxMUxEYeLPJ/9s/F2fT5NAnko+RFlv75fWLekZZP2s17yJ5ccFGhZyrkGX6u7xXK7N8G\n",
+              "Qlz8qfOHvgMQrlB8p4j7qtnPgBPf8mcsM295CuAZxkK+sut074W+0hM24VMAAADaAZ8vakN/AI9G\n",
+              "UrhSy/Rrhc/LGXguupji5cAHC2DVoxU1gWUkKeMT366GcmuxH5O8lBZJeHl8r2KNT0EaVARyW7pN\n",
+              "L4uNsKKl/WAzLJ1OZWTQf4NaAfodQGO9KzZS0j6oGvr/urKiQwbP44Tv//glYQyyCFeq+8nnrHBj\n",
+              "aACu2w1otySh0DYMX412uY6EYcx3GtQaRpNPiKQniWdVV2KH48fVxDy0uLS0SmCZEAWLVNvtWqO+\n",
+              "q2OwCBr1m50s0i8eRTlSP9xoKtxWC4ZqL77eAW3kYEBJOAywYUAAAAH6QZs0SahBbJlMCCP//rUq\n",
+              "gBY3NzYDjVIwwAKbp/vtZn3NtK6t0V/4sA0MV4ijJVoTZ+e36T0E9eQ0LOyzsqR0ULZJUDRy41oM\n",
+              "RdsBwM4wyEJC67daWmuDEXKhZo862uqAH8A0QJ5u5RKBPFpngChYYJdWzP3onEWImG8Yryy/SXt0\n",
+              "jQ5te76AagLius72bzwZ4AZfLm/04ID6oXhPwqkf1cNsu4/kIt7oCOETiL+lzwHLEnEsdPSz3DxD\n",
+              "uLGkH8o6jHofDxEXcB6cOS43aUxGKPYPtHCj2gw6RzcRoX5lD5mwqtoCTxk6N8TxyipSUyNnbA2b\n",
+              "G5NuBUVLHTce3QKY3SdkbyH/wzdOpT3YHUE+FYQwMKCF6SMyMBxp2gI9k4yUZYljUiekF2XIFkfv\n",
+              "TFy1RUmikOycLKkTYTreTarsMD5JfjZ2FJWrroj/YX+uNeGtKNZl9Zyt+k8u4Htq1bPYEjCrLHds\n",
+              "qeIuFWmvxTYEQblStjDXmWfITtxy8KvOgn9iV+KlidrnVhlE7Dz30fuHXxxFZvIzhgU9uv6sSC7T\n",
+              "vZuGMsKGBGTYmSe0P9hLI2VyM/8GUWwG/AITiU4a7OVDjUNRPaiIEt8jt2oImPIY8qcrJ82CVd+P\n",
+              "mSjoppoeHUTHmeo+koGqjhwT7ueVHNT5VZ4yuGKEDdFfEIkAAAEMQZ9SRRUsO/8AYrbCELHs5dcg\n",
+              "AyOPuRHZUWtdXLx9XaNQixO/8Cc4Q2MgEa/wKETsHiR8C1XOv7rI3JB0rg46JfjEArbHaTHmANKo\n",
+              "+czcI/sIduYNFOE3TvObMh/KtGpZSdF+qnDDtY8zD+7RQUdzmkG5zeDj3u4Vq+f3qnKCwgbU+U0R\n",
+              "dQR9Q60wXqL03p/iYVxkI8jJqvkECuxT7efJI+5rmzyP1yn+WKY2EsjjB7bwwVfe6RxBmzR9Ed/9\n",
+              "CA95ILUJxNg4HsmCO2Ko+MqZAH3wMlG18kUm2ogL3cKIkVXogjofyKhbsSpKLpFFk71DzB6NrY/3\n",
+              "HfknWM2yn9yeQB/joufGEf/bvMAS8QAAAN4Bn3F0Q38Ado97WJWiqN4XS53kTA5YWsnJBdebpf+9\n",
+              "lcN5zPySAC6fH/XzBsBKbxdm4pTiPFVrmGXyhaRiB6dxtlwj8MyI40Do8AXHq41BAunk4K4PTgzR\n",
+              "rFycWqaL549wB2C5jNCLXlq6Tuytik3ijlMSkx9noeIG2Lc83eWkRkQieksQSO4xI1tzzkdqaNhG\n",
+              "ExZARu3MauZwrBopslb/ZLdR5ZS0G6p8o9DD5cphJjxJoSV/70/0Gr+woS8Zj0JpVvvpygE5bXQp\n",
+              "/YBCqjmq4uOCyt9SvCzPelUEwXEAAAGyAZ9zakN/AHZ6+HiwE6fxvgA5rqP9zmI+FShvhJS43N4N\n",
+              "sc5a7qq0DK7DHadXkQxf+APmeqLrIGM9X5aCQgeyxdoAlcQoyNsm6ol85w5z6JV8A3YntmCae+s8\n",
+              "+8/Yheg1ctJWrSharoeypUyemQeq9Rm5cIkSOS9Ej0hbIHyFhPQW6K3SawgMNVKQ0s1BpJvXDQSY\n",
+              "x3jIEdIgEtwe7zce/DjcO3RNN3g+SlPoM7cl0qJbM44NIDG9JGXcwVrY/YKNrpChX0yegP2ZHDI1\n",
+              "MzOs5eWP/2l5loJrLid2mK4Qhw6EGFrIadsV8rSjzgHRNuzJ4U3JdubidEobU0ehkU0P6MYRK/XM\n",
+              "58mVywGbsw6LPu56h1S4w3zHGYMd1zPKOsnCUhaRfrSZTxvjerNQ22prVPqBstk4JgHdnSScrwGw\n",
+              "eQcqvIw7gKhonPDKM4fJtO4n2EsI5Cd0iGMjmgPw/PU3FL8ZP3QbYLMwZ81Wd7BLLBDf+ngKiFIe\n",
+              "it4neyhhaE/a71b8TxeM/ZrgH9+D76dlgPI1ZJW6CCVyIs6Y5gK2plkcgRYa0MwWF+1A6zPtBEgA\n",
+              "LOAAAAIIQZt4SahBbJlMCCP//rUqgBY9we30eRuAA2kMf/9/gX2SHKs8Uq31+W7Vx4LugxILnhMT\n",
+              "6icG5WQzdpL8yjIXjBq99nVaYweUdJE3LrdOpsVxNJ3kODVBkposYOoRuOMi/SNhcjrJwShp6ljG\n",
+              "Qs7tSeRJSYDkvm+SI2ckjbManbEesw6wo2ZffuryaLuWkU9SNALC+2QbPJD4bFy7sTmB9+6VOdMm\n",
+              "rnLvYN4ZyAJz7OhQG85P+JnxdgXgvSv66sWBs05p3vOE+53H+HQCMTLVgvoYmHNTIYtZ5CIln4hA\n",
+              "GrjLg53unVVQTiYlSzZrRE2vmtsqac+v6CrcbtgC4HktflvPTsvgqWNHri9NWa+EuXgx/AgGkZVJ\n",
+              "r1n6gAd3jtjLtv6YvbPiBBo2AhBUxCbYyroAjcvjwUBtRjXTdDEvdYfItmTKA7W3+KvVi/PCtod6\n",
+              "/3gOoaA7zRdO+8+MHlGl/c2xzQhj2O1n8eJkOu+NcsBkpmxyosDi11EOEaiQ6vfnOvH9MSM+7D/v\n",
+              "k91SLlwv/nF+5eDPHSLZQIoFUjHjwVoSGCdOLqmIe6tsfTERCeAhC+1bhRhe0612KIL6izjolsR2\n",
+              "nUgrl1o39HqnKAVqQ/HguEezLTgmGW27Df2kp4E1wRl/EQgEcsMfBPga1ndY4uHPYq84ArNCWk+c\n",
+              "YwxlHAPVC3PK3Zp2kQAAAWFBn5ZFFSw7/wBXFVHDEfqz5TAg6AmqzzGCl9B1ICKhB+tKz4Y9Km1L\n",
+              "/vZyZ1OR5rO815FlrTgGoncUDKVNjpKrVerCm+HleHb1b4FhYQG8B61zGq10uLuoQHIyL4Cv2/mm\n",
+              "s5Mi7ZftErBt64oWYphUyh0Hmn9dYYheGFzLdE9gvqcAEGJDyLZq+nfiK0Px8pHIgaIfsEdSUYcC\n",
+              "8Otyxta0EKY+Dm2m8AtQ8jjuDmkSHm/uLhgf1uCnztOKFhkR+ydRCeR9tnIlTfiv3gJbsPT8swjP\n",
+              "0OUm6yT8LhwwCJU0AGI9hN0/kTkz+NeSHjSPaBx26MAfS2Y5NEtva844h4B/RttjqxMsNDiDrfB4\n",
+              "5xn/Cl/3XrcF40eivyUSC+FHzx3M4BoLQLOKf7iz8hKiUrqRGVkGToUMxkr5192x9xCjbuvLRMd8\n",
+              "9Pel4WIOhSi52xuSf1eEhC5VVAp4lHpZmHCbgAAAAaABn7V0Q38AdnTaV3jxqK844c19uepGJJSA\n",
+              "C7DQuTz6pWfCzxcMbX5JwHItpyM9y3YT46z61a7h5Lyukp+nSKoO0zQhT0EB/u6ILUCNvVbb/89X\n",
+              "7TVI5UN6EFwYYfi4uoFmqb+5Cd0J/+d2405yTsK/f6WH/T+vNB1DYWrW67ctgHOgMHAWDLG9mitl\n",
+              "16bXmPVSi2sWzpWYg3147nlnaD00aZHqQlrMPzYTLLFwWHOLNqCoWpNLMMEevc8AnQWeykk9VNTU\n",
+              "NXzAXhrKDXl1tLQTxZG7GX3K9cQyeUnjfH3rMBGDD2zCLGXrMfPVl9EJ/F5M49Rjn38sXUf2JvF8\n",
+              "D9r9tV1APCHN27+egfFIMDg9OhrQMtjAe3WEfpYS7pl5yHh7ZZ2CedEo/Wf/ygYTAQFI72AaUTrV\n",
+              "n47d9OSqAdYs7lkgV0864auRyPQeTKK1Sp3ADeIFS134VGBNG1VnrfyZuznYkI2r0FVkGFrAXpUu\n",
+              "ZJmyKqqILhJ1OTBM8C0VBV2QXBYa2aSn2jj9t40/wJJWc9IGAVR0vj/u+wFocjwf4QAAAZYBn7dq\n",
+              "Q38AeUc/pR5QUuADgu7/kKjYlIf8yn+MfKKvFMJ4eRJz/DRqteBIBJsZW3T3phi3NzuSw0zOvEhr\n",
+              "CHz7xEUteyaR+fa6YCBeiCtangbUerW/UGoCobzV/74XB/lXH53NcEw+6x9o3/ZgwG/7l4psK3P0\n",
+              "EqSwtCrcKAAv8Wi0Z88mFp3Sp19shMF41mqYa8pNsyefrruQONS60LHg/1GySbrTeTWW74lCDwnt\n",
+              "BGXpwghp/QF087PP7hxkE8lvu8APh5F1FTiOCBSvJFm6yFC/tz24gmveLoV4Rq/qtYWRE09VDCDH\n",
+              "yjftToPMsyi4DoCtXsPRk5Jxr9Mn6xDxGjfz8uMmOKJ15ejPi/Sx9cR1QrBsU9dhcYifdB+c0AMF\n",
+              "PolB3N4pBZAASP6m7EzaTer6yZ2sIKcQdlGt9xsZ0SHtS2313gpdJkLEVrHpO5/BTcfUTTcK1+bC\n",
+              "PwRYX+iIyInP1m6htprdy84ySZ5IaGCpRKFxMCf5w22wXyyon+dlMPKACguyEPTCCZQ2MqEuC+sa\n",
+              "uB/hAAABxUGbvEmoQWyZTAgj//61KoAXgR9s4tVmwJ9HTza3s57iAAoQf/wjqzjlXnP+29f12EfR\n",
+              "S7B+4I2epG2qM/uoQ7VlrfXFlhjyX/aTq0n55QXAKa2xUKolKsuMfmZFFc6+GP96b13JiSidvPgt\n",
+              "2SSGnq9Yw4MfceFmgOaZRcwoMnpdb0UpI73YdP+DfypKyrkDqKWcBc/BGhrH8+XdnpCNDXfg5rMl\n",
+              "b0uFlQ11yUxnDYOfRwLbdjJA6FYddawSEVorFtY7jkSQx+OUBUgWkKC9rhKB+uV/yqQsvbuFiyYV\n",
+              "MviBpsZgSSN0TOC5JedQ5H38ENVBLjXnWZD9PQyueLoT4qwtI+7lodFSnBG3zboWdj6P7XDbgKT/\n",
+              "zKkFObUjwhstiQtohzxd5AXhBH3DQqNv6mRzuMxFDcTEo5ut/0/1HrPGOF4R3sJ/eQT+YnYseqvc\n",
+              "0m5njpgI3qkLmn8efBB4q3zWGpHCxBwC84HKjuugMICuXfcJHKn0aWkn65aEjT8AdxDWE09InGyo\n",
+              "EM1wsU0JgJ/qq/6MdHWfQW6+bt5xWlpYJ4axi9wZc3Aoz+Rixn8UVM2e/bd31+W37ucz9udquxnL\n",
+              "2JdNUAAAARlBn9pFFSw7/wBZVXkLa/7xg9HEtDOpc+GkSv0gCD3x6eQNkROUaCyL6QH8m/0USPLW\n",
+              "nllgC+uXg2X8kUpaUiErsLvwKd9y+trtKwV7xlvkAn0JqEnToCvptE1Sb8eF86DTi2ywy7WE/imn\n",
+              "jNBYQny1cV38ScnZp/V3phWQAYBG3kUdNNuj/FyVB7DgbQbTLK48AO5nLYv8B3LvBNBfBJ+ym1yg\n",
+              "YJXKwjm8kt8xUjO2UGKeggZOs7YHWr5Fj8OX4jV/B3/cMzP+f6YyrayA/80F6f9vgrbTlhWdlFQ8\n",
+              "QtrHKjmrl874OSSPJYH5wfQfF/1NrQd6soxjmSWYI9/FqOPoy6ujUPxQvg1fUda+wK31Cv8gD96H\n",
+              "LPqpgQAAAXkBn/l0Q38AeBaU9hYCjxV6lA176iBcJKIHTfhwkqkAB+a0LmdvcgdK3vyEsSkCI+8U\n",
+              "up3OQ4OQId/B45+Mf5P4Fc2VsfnQAACxyzNkvgEEYwZk+TyOR6/VZmeFNYMrBdqc2NNBlh56ISK/\n",
+              "h5V9lagvsX7yv0p9Hk6RXo3uoMgKhKOv/QgBAqhUvAKDw4DS7G31tehd/myRMmCPxIJ79bZsQe2/\n",
+              "iq7Nquzc/VDpPXFZHPvOmiyfyrt6Fxc2jLHZJGpvacPTIeLJiSaBxgRTEKBr/xXaKQjc5nLhlwgc\n",
+              "HSz1WRlyOsXOkob3rY8KoGVETaaIvHEl7sVHsV3QN7iR2rIGzf6YHv+c3l8OW1b7tAMShtcCLifl\n",
+              "8k1OtS8Z5o7MNTObuLXIONSPGo1fC97qRzqHFEfMZntEMqsFjjWPM6JduvRiAv8p/h0kRdcTeRox\n",
+              "t4PEdFJikYgCJgtFa00LDpNvd6Vv6MImiivCAgL9L7zEaNCr8p/p5ZiDugAAAO8Bn/tqQ38AfAnX\n",
+              "r+Rl0wYAC9kEZglKr0YEZPxbFiynbDVLyUoB5/4mwbggJCKqWcWLXkOc702XkfuMANGy7OD7QUCV\n",
+              "nopFHkp77AuzGvvM2JQndhYVkdbX30/kmHQDID1DcpthKQBbzUjm7wgAOqbulxKDc1OUw1plN1OA\n",
+              "iXs8Ju+zQDtZelKPfekDEF5iPA8IQMn3LLocZ168PVHW73hdmgfMFTsqduJxZ1oiezDuUBPUKdNQ\n",
+              "1lGg5KUsS5A9iNuo+n1shJKCmk20FfXGeNEywAjYeaq4bao/dd8nZn//htlIayY083IymAgdHbKW\n",
+              "UQAAAW1Bm/5JqEFsmUwUTBH//rUqgBbB5O6qXkABRezeefAxp9PjwxeDBuTTFSUNk2voPSz0T3Lj\n",
+              "1K/LmQtEI6YkskJKgxvIXHGf8LHTV/h2Mg/qV3IQ4zvBygOQs98iZyR5jgV+hQ58R6xIcus/6y5a\n",
+              "HrkViRrv8Sk7So3LYWmfkLzyR6vcCKhF/sCJsY8RS8BK5OOGU2Ll4Qs1n4jPQwTLDELf8SF2+07z\n",
+              "zB5hexERnOHmWZ9THKXS8j6NXPrj2p32k0gvmlI4b/Of9evEX9mDBp5GtQHOvTswQ/VYUajAUXz4\n",
+              "5w6EHuB/k+FBz9pe+B69syJ2X5MYn7Qi9rKpCl2kZv4uAWXuNo7oIaU7hr6elcFz53tdL9AEjCAb\n",
+              "BlT3p448134hjvo9lj95CHF5teK1w+R310Gc3NQ0eeJcsiYD2EoVrHHjVDF/m8I8JtTUFdJ3xm+G\n",
+              "muADOcIpcqYbeqyKWwHmgvRze+DMQbkLo4AlgQAAAR4Bnh1qQ38AfBSmnoPKZzTuFWeZOcrkeWeU\n",
+              "yVIALsozlefbqRZf6f7w7fkPoFSkdlxkJJsnO6qzfbc/Kotbm2yeFrIQw5yspszQL8gAAvMHKSnw\n",
+              "f4CTQ2vfLY55MADj1baDD7LZtn0UK1Eh1HnwXobc+mdHd/JEl/a2Tszf/EZ9+J7oMl+BYsjWKwNY\n",
+              "vOv5flnnPLcex/hWFIF4n+hpBybvasl5hI9mV0CeAAyAclftj8N9n7hadcpM/TOVmHbSkJ3cr/k+\n",
+              "StSwI8gY9k3tmbMSZc42caMpFr6YdNCCIj52zmNBccPNFxW+UT/4qCqtX1gc2j7obKDaWzC1yj1A\n",
+              "td8/VAjqVn+FzuuEokhhvubRT3RCdxeWnBTCG0CxwC7gAAACMkGaAknhClJlMCCP//61KoAXgkIw\n",
+              "VJpvAgAqN7f+5rJJcY8tkjj7p4LozjswOy2dTydK33mOBGS+NojRzBOlwt3ro+/vdQIUTIVrXKwh\n",
+              "2SrHPCPJXQoCjJUPkRODCmqbZeBHsv1r7iIOZPpX66HYYhWgPLvPzAb/Nqu9nQqKoyphhNy32+S5\n",
+              "qAFvjRKLSjPAx7GoKGUNMbYduhsBsrvVTwhrV8uWAls2mxYggJzVuRUZSL9cSt+tjl44BXjlbo1a\n",
+              "I7ybNHG97GCzcbSNcg0RA+iqwDsdnrZCO0zsNdWK1qVmER0PsSf0dicSrZwIcxZWy6JbkwQn5TnO\n",
+              "kAah3wAs6pJvW+a5ZiJHl6sVlU3yCOlrECAESqWu0YR75WfiMXgesBOuXGGNsC3icmPYNzM93us1\n",
+              "7GQTI6RmmFHGo+B2yAB2YJiK1YN/T0ltUuXfFAvL4UdHgEXOVIqVj+S+YpITMKy740IvYQ5zuZPD\n",
+              "ahdXF7HIU7xE0W12w+6qkuyZwxUMXLXdgx6svudMor1GNfDCdymcKIidhuuXh7vdQrgbivH7usVC\n",
+              "zjMqgjGahkW1YlmytCooEIoULx5ux9DK360iAi4u/nAomESdiosanRfQ9jQdJSpo4rurLfeCLF1Z\n",
+              "XsQAQRTcezHlxp1tz3A3WsYMA9urPBB8pUlDdB63MfZDCBphVx/Ddv1AMvPXFEPu18oREsV3BdKx\n",
+              "e3lxLWWpytzF3zXttYGgBb90j9DgRGE1uaAWyEAAAAEiQZ4gRTRMO/8AWVV6uU/hFqUNYqrP23yu\n",
+              "FpB+ECoAQNVnJ92i7ZF1i7u1D6K4L4gxm2RaiGsRDmf2iYWEjO8yGHAqwpcDep1/+H221WMh98AE\n",
+              "VV9Ferf+hy0D7Zu5rX4Hp3s1TpcNcEBIKPHVSHIzaZKKfPXkqE/ga/eepp8Bzdc39OW6g91hVVvf\n",
+              "WJxrnf77rapWbmivuJFfeO9u+RRykk/agdEi5E/5a475KGQprA2yl390PNrCvoamPyXbETwtbYAQ\n",
+              "pF9uDZkHdN/NQ1P4rz+zQLJx21eQsP9WBLswpDFYg9BjPw+3VrVEzeid2j5wJBlq+56Hw+Ex6fI6\n",
+              "1O0GbWSAC5/5Zg+kGX0Yx7/We9PseMWGwXWIVwqI7oHPEnK6wUkAAADgAZ5fdEN/AHk02mburIzA\n",
+              "1V5U+8CauxZABexQ9zxvy3GIkNn2+19EyZqnRm0DMMsXP4ZwiY8vW/qdBTlATfbmIFDxCTzt76+L\n",
+              "X3WaNfG+rqTfzj6gLFFHl5IJDtQmIC9KAmTgQM0Lp8TEDdYJnPYGFybq0Xdyl74+130DteV0SYTD\n",
+              "hgB6230zJvCx8ZW04pZHmYvtJ1LZAxF3BAWKPXcstkh7/Er8zYdPblR7K6t0r3b/sIHpME53VRBk\n",
+              "ggj1uN/p+iN4KwToxjP8kZ1opB7xpkyOQpicygiGnwjU7EpZpywAAAF2AZ5BakN/AIdka2Wer/IA\n",
+              "EJVZr+9KNmiS7zXHA/5uJU6D0CbJOrsLPWcfwAUCZZjhlCsnAlgzrrGOONmuxU3En1TfTKb/7Pu5\n",
+              "1R8PfIYkV/dZFitvMyRPMvzwXX1OcxtjbhM+M0LCh6zNEWJFi2Pi95t8cspIknD4iXNUblA3oEFp\n",
+              "VGuXt+8S3Upf64YqAxWADhb5zxXL+O/gnWiyawM9fyRrYcExecMkEiv5MHRsJs8Euzdps1vwxzNA\n",
+              "Zu4bu6ic2K2ueNja78qXGaHz7xLoPIVJv/T4KAuseyOhznfFtKf0Ey0eSBVK9qutGGF83lfe5Wtv\n",
+              "xb73lHTKLAyiyJassoDHBSQLAcUPb4nB6xWNr9G9gWtqEIp4Or9tKJzZIZ1tnIKZFZGb0ELAlV2+\n",
+              "pKKDz5nW+syHi871Soc3HtgomT3Y1cp83yQG1GdKkcJPkU1uJVzsVPzbXbSU7/z2Q7cikc4seN2D\n",
+              "ryQ1l58HjUs0ikCXV/V/CDkAAAH6QZpGSahBaJlMCCP//rUqgBbmS0XBN5gNQAaCJTjyhVwVkMwl\n",
+              "GF6KXnd0XUyzqjFCJEv0D2xQiJu8if6sKo6qHl+BP/MZw8ss5OKq407INzCjWOsjf2HTKyC5fNLK\n",
+              "wiJv+PzieOozn64ZK7RRud2QUaDe0kuhk4uCClSYQBImrxmWeEf/X9zH3+ilYhfoZigVm0IoMiuu\n",
+              "YX1ERVdg0Ld9E6wxbYMiQAGJU1qeeTwc8vb3w3kiJheTA2PNXtrJ98RwtpnhN6QxMe1dw+aQWI7S\n",
+              "j0oQ9iNx73N93RuNVRxXj/57S9VltjA0RTZBjLvYS81QDA3fBgaNHNzOBZ7dztz/rTxxOpumjTTw\n",
+              "x9FgnvlMsjx7FYPKUcXD5quVKd8lwTlOiGVI7X1HEv3Hh4EvpYVt6azhUBI1qGunVb3X1lyMhWJ9\n",
+              "p3muqcicwInEt+BuHY92HoNXaaJJbbQmNX5s3QJbI28Pg4gc2gaUF4SQRcBgM8uwcYUzxEkBS06L\n",
+              "0moZm8bwMsLYCLj3fgXOyFudpfg6jkYPDeVK811WbzEz8Hcd42XVL0EwE3bwDc+i2I4+NERo6J6l\n",
+              "d4d7nOIvqUuorZnDPtlYcfSWgBqdP0tQHvFb4Sv9QUCBvXlH2IEiNzo/daaHVtbFRNZ3cag2HOiP\n",
+              "lMxyt8xYJMnG7di2JiwAAAD7QZ5kRREsO/8AVwwP3fRRACC0tQoY45xe6yfL8KMHlR1wbd4HcPUC\n",
+              "+4PcnqOzdoNv80ufRyOopFYryJahX+qWFUVKK+nDtdvegTv/PqvENcT8ykEwwQ7z2oNUdaMITYi5\n",
+              "4tC5YA9FaLSBorMGx3aocAbiF8065MBqyaTkiW7FtGRHVSPubGixAl7hiQRoBoEipfCxkE/EBoII\n",
+              "omSCNrFRyjd8oY66cDfZt+iBI44uLDeP6eHMEpBALsV0FY7iWjBLaYO1t2PsklOb93SAExoyIX1I\n",
+              "TiPXiUgrCYe7dgepAF31BCnOuxiIAPWKLDHZLhGOJBLqdemk1EZoKCEAAAE5AZ6DdEN/AIteG4cJ\n",
+              "hGXgWAAHNd3/IaNiUh/zKhTXYgf+UKkbUvWJoLo7whMXByWkvy3MotNcPaSHeaKS5vKy/hBJIgk5\n",
+              "CWcdsbd5QzFHyjOIZiaEAA1AziqRPTDRRVYKhcrm181rAlAdaYmvKZAOu92pmI39/PSQjhiMouSe\n",
+              "XVT3pg0s+/zN7WMQCHqTmey2TTctwD0YnAH9CK4EMAw1jPCCTXgop9epuL/iXjup2S+LS3pGE3iO\n",
+              "oIHon+1ERGRC2Vp3b2QAstSXzK/2zI+bVnxf0PhgKqa/NeuEaF2SBGZ/TyqGPDnQfJRorCp1s+mw\n",
+              "tm/3aVbjKRTXeSwl+OCfF6rMqjf/Zw8/4yrjLNmiyOgD8OWqATkM50NFqOShrrTCaHdcxgVW70ss\n",
+              "cCXKxvzAUCe+4nK4C3zP8QAAAWMBnoVqQ38Ai2Rc7ISR6q0L0pberS7nbElvP1eAuajd6ehFPCEk\n",
+              "va4007gA4DkP0YAYAumNCN0kma3A2DvFPa+NTDmrilkXNhiNVTFRLzynsy8rdgQPBH6k5DFr/4eZ\n",
+              "jmJjfYPWB5+2eEYYc9uJ5Ni70hsVFfV+T8zp+ZkLZnd2wv7AZ7A8baF9R5O9oQlCkoVPxkDHTrmt\n",
+              "rElQhX8Fi0yj2+BVP5O9UNPGQU0+M3KYUTg9yTBG2cCw6Drt49/5M/86NN03F5R9JS9KGOfJjIlA\n",
+              "koCavGpTFqq7OYU0RM3ilfXBmxvL5QoIK28Uvs71J3h/IvKmg4v/14n3/eoSpqNUCC77ty2SgAAi\n",
+              "rxQNIHz2GF/lpTynlwsORrYNT1lJMVud8AAQb+/SaHWQXmhJ+8cZTt8XuMgG/t/hdF6GqyG0A/Pn\n",
+              "hWRq+asN+zBaeyQUWZrjl8ry0h3WPkAZksFb/gV7ABWxAAAB/0GaikmoQWyZTAgj//61KoAWw9mB\n",
+              "34Nmlq4DQoTYIkneVdOFHxDDrFwsv7yxZXXwNkGuLMduj7QGT/7lr2bNfzApMJfo9/ffM5g789Cz\n",
+              "1Mn0zxePHMHBL6IHHRVXWyqDMhVLYnQ9xFtc1jml18If/8STBCOf+AZjMnARcFmX1IwLt/ziVSoN\n",
+              "e4GPKKZqfZWytoW7461OuaeZ9dvtxrCL+W45zobgR5vOrVM+Opl+w/eFlupHlgpQBWgJcPy8sZC4\n",
+              "/O9laiYA63xx6M701UUvGFsRI+RM6anXyjKc7TVrmZ/YQKRjqB6Mejs2G1mTDkBn7T2ZURI2vZ3u\n",
+              "VXRNsQnGYDxRUokS3YRHs9LEF/gxKSdLEEiHDqcoIHyS2FPM+cIJRSvB7sxIA3hgfN/O4qDK6VO+\n",
+              "t71oi1H0Bkz1ugONnVTpQr+WeMS5AtXXNBMXU+ycO0+R9eRe9BwSk0V6tHm/HJ45oIYvyWTj3yZa\n",
+              "JQ6q+o4isbf26PsTbuSAcvQoMnzEXJkqElGJ8Z3rZtdkIzQW0DDnXeNRbj2wQmuUNBknMsWOw2/t\n",
+              "fD8BErzYLXI65PwTY+6R5c6RWYzF9HNMLBaO1c6cI4yEu1DMKtZW5FrmVuc6hg7VnWxgAgOdFKFA\n",
+              "QvmmcrbHsqCH4rkez1y5GoMlxeOuW5WKa/JdcefAflYgakEAAAEQQZ6oRRUsO/8AZUEtmg0dqwLy\n",
+              "ubLYtABfXw0ri+bvSnwBqWW9hB3/jYP94x5LyZNY560IvuBe5T4EX3/71Gbqj7BS5SJLQ7X1JK0z\n",
+              "I9iR6McwRU2BDEhu+2JQm1RA2fBVxnzCyNr1JVnfyyuumlkNzE8n1UgnkIbS/FMxc8DghB7zqZzK\n",
+              "rkagW0hHwSjNf+LJf3DnbXyvnzmB1lcv8Z9QlsnPKDef2giSgbZeTNWRMfeu91kckRy0SSKkaYVK\n",
+              "KUUpf450Vl2TzPLRaNhk7Du1IJzIJRf9supxssXD9v31LAVibgyznyLU/cS57Vr8KEXG+WpKysV+\n",
+              "6iQmQ/hCoRg82drzuniAPltxm8MMUZwVMGAAAAEzAZ7HdEN/AHUKF3WsfCAA7NAZyuGlRySXJzA8\n",
+              "WtPYIqCp+udF6BaVoG3w794kSqeP3syNbVlr+uFhruNMOOzTsNGrbATFZMl9DU6mhIXZ1HEAskmI\n",
+              "VVSgXlz4sVX35JqYrDPP8r9Bsg/O9tAp7LnTMjWlqOdgOPhHpyqf/hmokPsCwqtKfsDhxP/tmX60\n",
+              "fhM4KsfvpygzK8jmUmY/GDBCISRQeW6U8uaq8guf+cvy+sP09JLJ4HsULhIsm6kyYO04HBdOFUDr\n",
+              "/8IzlOKX3w/FCxhimlJIduY8iySAFQmALOuag1Ry1Z3p7NpGIGhZp/q5hzsMAsH2jpHXQPdtFNFH\n",
+              "4VkqDlRDeGqieCr6gwu3hPQQfF9yauq4qf5R+bfPha9tZ3XjpRO4eqNaj2xEQrcb5cIJOAAAAUsB\n",
+              "nslqQ38Aj1e+ZhXsJE07lvgA5ryx/X3Tt1hQ2T/wP93u+Km2fQtCsS47kHT/v+BMMbdxEWzwYvcd\n",
+              "d3NYalS7o/aUthPBRfYGmx2hUIQijLOXN4leC3SONeoCputIRor3Lgsy985K8UL4nvf1+pFmRQg0\n",
+              "eJgJ9ubt7jVqU4S6enDDZ82+hYwxDWOROomkxsOv8nlizRgAHHE1n42Dq5sLIu8oVYp/4M1h4rCy\n",
+              "m7AmDrR9dbHlpV6pqPLshIJSKr7R6XCF5H/mgt+78ttEoS2XxbrmVQj6DQtTzcYF1gqzE9DaiXTc\n",
+              "rKcf1aBAFclenBiNHhbAMEE20Br4FIkr51a0ynzJocMgaUhstOH+7gKJGCsTPkykOiVzQeIGOfi6\n",
+              "AmLkbzIds0NOnV21ExFbxIFAMu1BymG8Kjwvo1cLb7372R2f+Qt5Z8LjmGrBAAABxUGazkmoQWyZ\n",
+              "TAgj//61KoAWP/AeMmkxh4qDG8hcZFMZjYIY//v8PGtlbWZ+A0oGGFPTAdgmU2TFbrR0QmwUCouN\n",
+              "e8fq+V7LhZ4IhSGjAEZXRALCc6lvXQaVk4Hy29vGup69bTfpCSIWWGXFW7WfQjL50GRbZZRZHQ2m\n",
+              "pjAJ2N9/bloCCNQEfrVxCeDkKfJqKlRpIdnOUaiQpsnEysqkLqMfxaCLAtiv1vFXcLPLizzlMPs7\n",
+              "NIiiAuhD4+CMokPsODEut5yq6fM1zRym2P9iids6rfyvN0EtWlvUXkAIdmS8HfE5DlX5rtipWZ2i\n",
+              "d9rb+tQcwCfWN6erokI6tARQJu2c+ZSF/sI7qofDkfNVCHii2Msza0cnJEbLkEfdF+gBET2KrdRv\n",
+              "E5mgO+6ICEAI6O/h7r7DxvTQ9Wxzo3mHNo6898yojVZYUAEyiEUBn5+alz6XfA0d5GcOXFRjv906\n",
+              "SVSt5h/ZyjXd+HmcrubYPlDuxhjCrkqyrKcbhfJHp/Mq+DI065H9OXdNO/+uDSHvPcKkibqiAVhI\n",
+              "DqTA+NZM5+PbtXMsqU6iKpSzqr3AN5mBITP84n9JoTkmCR2U/+5h8eajZc3UcAAAAOdBnuxFFSw7\n",
+              "/wBlSP3uCsGGoV8bqfG+TF6JTvUuRSAD4pZzJUFnxrFOJYnshFJtjPOw7rAcguf7FPJIlPqbN5qs\n",
+              "fqCPl7TU74m2w4/OJHMnDpS1+crxo620hZORUqqaN/UeMSuSm/KKx2/MSsIgkvOy0fYS1MAD67Fk\n",
+              "Z5FUhBYQOPZatG+Xc3Icj+kvLjp5v9fX+nJsaNN4CCl0quEK1R//8eZO87p6DKKxlnRfV62uCNE9\n",
+              "o2MWYwf9qwHYbtyqG6I4xWPTngQnrsOmiw1Sy0bIvHiKKw6nsCsKdLVPqCFU/q5rppy8Ah4AAAIT\n",
+              "AZ8LdEN/AI9CIO0JMMhrV/0AB0HLuqwUdobO4BdVbPV1Ioua5WZC0IWTaPE/7qAFTCgAnl3rAoSn\n",
+              "Kk1336t4zGyyPYAAOSIcqQwF8zee7dn7XFk1tvgy6W/qOMTmkEiEdwceoRsnhNmrNp/TK9OoMIUg\n",
+              "ShyIuwXG8nP6tDCpAEYSuvpzo5kchXf9jICMUEGqQZjLulIdzbNUEecLTDRk1r3gpdToPPcXdXTM\n",
+              "AElxf3acmkXSo1kx4tBmKJrXm4kNQ2oDIaqLOc1dGZ+ccoProxsI+jQiCldj17rGF1/E4alcIa3L\n",
+              "dIofRLGOPkev2msNj9eN+tELiQktxoUq9fKnDsRx9Nbc5IkysRYA/KsIu02gpfPyisLPQwjLSjpr\n",
+              "jTxnZViCfPC6UCMSLVKUvso8AB0eV8Q+lldoHmqd+EeBeeJOkPU3vuU/GQacMWsLnKmVt/65Nw0r\n",
+              "y1AnL9+YKkDmvNgpqgQANfZvj5NhddHche/p4la1cXWhY3W/jmtWxMTkOC4tX16bao5sNwcVWRvt\n",
+              "UHjkDIOIXB+3akBV5Lzaef6YjjT1MeUeFh/FB0tOMV3Bhvdw35krP/ItZ1RF5hRCk1oYqz0ykGZW\n",
+              "YkciBlvCsweWM2wXwX55h7SZHtxiKM3rO4Aff+TOWGbe8hXaapPE+4wKof+j5KoQ530gP62KsQIG\n",
+              "BV49pf0LYkAEd7yVzO9dhYYFAAAA+QGfDWpDfwCPWoxxjdaiaFtca/OwfG9dSAC6jYuqYuZmzKSC\n",
+              "kzbTtnf9idy9v7frgKuFjQymibohZCHRXBQdujo9Laqcw233I4Za+//Mdf06kxHe/IBTsCsxcSfV\n",
+              "ksVUEdqCe9dEwWwg//4Ee8Le2gLXqz21e4jiFyBOjP5GsM1hpupcfwZtr5Mo/ou28BY4QZExXJ0H\n",
+              "FzCqK0jKq6c//ut1tsd+kiOyZUVGRAFVkS8bi0vvjrj3zga9Zaa6Mt7yQii43DdcrobbVIWdc0QI\n",
+              "3+rsc8fgmOnJ+GJGdWYzpFLd5zMjS5ofw5IMBt0GmHVcG82Z6YQkqKJHzQAAAe9BmxJJqEFsmUwI\n",
+              "I//+tSqAFjc3NgONUfiwAKbp/vtZn3NtK6t0V/4sA0MV4unWIJlE1N72EjQeUPmvxOpceaVXIrAK\n",
+              "21oMRdsBwM4wyEJDPiji6fXmMlmmsCvOtr78Aj8gA+xKnVDFjoVlH7PPNvnMo0iZJruZeFy1B4T9\n",
+              "/2iVnlLy1r3LZhoykeyNXqaKEANWeqYl2HjpH92g+fHSONko5D2m4SRKJwFWFllUBg2RTQ3etVYS\n",
+              "PdQGNCLeaZwhH8zjnIe5Vuu46VBC79Le/PF0x5A18FileZQS8Adcvcamp8leUQ9dML537b7ARaSt\n",
+              "9Lyu3Sdke9BouNe3+hTyxzxAi1Setn//aNMjVtdKZIT0wLvPIMCsfe3gvhpNMtez9cWJYRUO4qU0\n",
+              "Dlg6h/pUIog+BzidDDvn6SZ9WUgEXhGZOFeOBYowQfwTGI3ac1V8O93aTpJwa/om7scQbOrwAjjK\n",
+              "gaYt9yqViBt3FWYRIoJJGYqmGJkf0tLvcymA+Hyayho8kg3J33tLzi7Gkd8xVzsn0AbjvoJ9u5le\n",
+              "OKsB4L1kcStddnytXouu9GStBCQSRLPeb+iGeZTwQ5uYY8D5fTAcb3C6Ob+B7IWRbbytzq93Kz0y\n",
+              "yYvbeUq1qJCNW3/zJeXeH+8yV69x5FRyM+55j6UAAAEdQZ8wRRUsO/8AYsUcQvOGOSSADI46r94B\n",
+              "/W+PEO3biH5wUahFid/4E5wZcJb1S+5KPsyD0qQEL2HibG5BPsDLysut2eDJfU6ijjP6zrYmNEWR\n",
+              "huQfgh9NsMVuoggiphkYt9ccXxVhYHn++9K8YAnkm28Kzp0jUWHgD2VeIoDjCfJPNnBqH+CERm3s\n",
+              "nubUQ9LmttVf/+MNJAJgtOFW5A6IBAcBpJtd5kPS+zJ8VxzguhOiD6Pf/zfgjMDUsehmT57QUanw\n",
+              "gbdNgBf1mSXZw3Czfs4swXmaj+42V39PQblTRJ5hVxxBfyBMHdtD+eP+pUlQP8pBAAnf3v75+Q0T\n",
+              "L19oeS5dx79IIwiodA3vtFf2KOiU2gODZqY3kJGizWNAAAAA3AGfT3RDfwB2j3tYlaKo3hdLneRM\n",
+              "Dlhayh8NourV4B4kYRi+kgAOdUf8hAGAI5XCPTeroAwXn8G2yGEphnv3FPeZqmLNmvgLgUkPciaQ\n",
+              "A3x0WVLvMk+lZn6cJdklOXHEnjNKsClw6wU0RbMDBk1zQUzYb/75rZ2h0N0KqL096XGATDutyhUZ\n",
+              "RVkyTgfbEgHdPAmzdroStgpcOUEN4xVVZX2E+XrryGs2/tIi+iUaglsBszkGSHUeEuoEpHc8PRHH\n",
+              "tDc+6s5rO2oABm+Gux/PUd+4yoXEBbF4DtdMIooAAAHGAZ9RakN/AHaNgkMVTymoPnXABzXUf7nM\n",
+              "R8KlDfCSlxubwbY5y13VVoGV2GO0t+vExf+APmeqLrIGM9X5aCQgGSaQJX4OQoECqyNRzFZQDLhW\n",
+              "KA4dfYJp7oYRPF8AMOzGYqm7AO7w7FtM2J0yD1XqM3LrKYS1dGZTAzMM0YXyhFuS7+8HWwRTCnl1\n",
+              "B1MtLMYaA8qvJY/AATH13D2takXBcx78I1sCsI+P57X6Q2Nh62/bggQuV3uhAAN0tyrIgbNQYVBH\n",
+              "gFwoUmXrxaEApAv0P2E40tM9SJDDcZe8DyE7ljCyxGjQA+gKJHzTkZCCQsmlxDg5It6wsdQ6cusN\n",
+              "DyWnlyoq3MMo7ugMYcm1YMEY73l36Y/R5wo4wUzuNvV2tJ3rSYBCfXsVjc5o1oA8OllKUpgpBG5u\n",
+              "9AavXOqCqjA07sUF9WlQ9JPrhiXa9bThYRp0lNBazKKlKwsBPK9zJ1/OayuptCCUOtFLyDYWpp2k\n",
+              "qNXWH8r0IpnJjxnQFcNmI3LKk+rH0vqX+48vd2BUqTcJ4rwX4e+V6oU1+lJyU8fmS4Kj/iQFUx5A\n",
+              "ntiGKLVWwqfkoYN2YexrEPVBTpKi81wf61aU8NAxYQAAAjdBm1ZJqEFsmUwII//+tSqAFj3B7fR5\n",
+              "G4ADaQx//3+BfZIcqzxSrotcVc8CLm7cBBc8JifUTg3KyGbsl0UtvUGR3t77PRffuzjjVfcKeiAp\n",
+              "EmDpLoqmMXTQU5wmHksjapt36fasfEiGyN1dOKyOI9nT0TFFL0pzQSss7Ux5GajOaQUF29zSIoeo\n",
+              "7hOusjWiFyZylISVuEBU8nCgDYn9P601XpFko2u3FAuYp/svCLJOzc9W7b14FY05eVZdhfmiv0Wm\n",
+              "d+i5ZPIv9mhB+8Cb50V0LQeFfsyfPeAABtfp/HIPaN+amWONE9vQ2YbC1JsqKljPbi6Vrd258gHB\n",
+              "PNyXvESqATfkK1Gnk0AWxo7XFr5y0Ce95pJr1n6gAd91M5RV5lL/XAgE7sYG4524aA+cXAa2XPdd\n",
+              "1BugfbN6YGWbktwAoVIXoUq7TnrmhBrw2FHa1aE9uMJerl9x/Rs847iKP+iuBUD2VIUOVa/G9Po0\n",
+              "ksPo1bHVIsITIKnrhXV1NabDgHAc5kIv+PJk6IroGA19oMw2I1d4rGiaYQZE9dmK1VRARJ9VXDBJ\n",
+              "Vlz3aoQhCyQZvwzvxWhVA1iU1RO1TWnJsppajNeO4Vg4/b+BSviIvrSwwqmjaRr8iuCpVTgz+ZJ6\n",
+              "95zLiSdnoIFqQJA1Hz4YR/KIOmAfhTTnHcdDelso1m8Bx2oHlzAOiYwR4NhSSRD6EhhCU2kXf5vn\n",
+              "vYdShk1Y3/pp+Wd9yZwIwTneJB0AoI0bbmfrtbbWj1oAAAFQQZ90RRUsO/8AVxVRwqizyog1fzvw\n",
+              "w3oFk0s5kH60rPhj0qbUv+9nJnU5H1hbksC+yivmpdt3FAylOp/Re8NoooEKQr4q7MX/kjNCB5zj\n",
+              "aCmG5E3TxVGWGCYMCsdEF1I+HuXX2a3wLCwf1iqCfznNMRG46GE6nIgxc91oY/zfMduLLCzyb8AQ\n",
+              "b20W2eRODsXd4+7XC1RndLreJ7Km543AdL1iUo99hYdoASXjyWRNv6wvJrmyFngIDlQOrLluZf/9\n",
+              "T8Y21pcggXpfTtvdj+B+3lZv29AFHkL2xGPZvyL4UyVUgb3U1DWd/iySeGzlK1IbRNu7obP1czi4\n",
+              "Rchm1nI/pS+cSuamJbhlQHIreF0u2/zcrSGkuOpbObSfAY//5j6RVfcQovw5wL1RQN0tcA1GtFxu\n",
+              "ZpovaLthGUkeOPh8iV5bEpupJR1R79Ew1sEkTDugAAABwQGfk3RDfwB2dNpntdq7wHtHkfExb8Mi\n",
+              "4AOIW+6weDVD4WeLhja/JOA5FtORnuW7CfHWfWrXcPJWyNJJfpx2maEKeggtR3RVEAdA1a1truYO\n",
+              "N3PBvt2C5hri51AyWveiUQtRNh8OhcT8b+NVPo5dLHlfN2wr8ZipKDuUP3k1md+EiPqVCrK5TuMQ\n",
+              "knvfHHEV8fXqrrFiHhWYrAGbSJdOrXgrQTN4JDv0LMwXs1Nl1nmEdfSgT5BF3DohYi4r2xGfiJcJ\n",
+              "KMZ1oPHaRBjgxhu40ZP5HqUG5rQWHD92UCH/Terh0cf4e0554mxHgDF9CBXD2Ey6LaV8LB9Jb9nA\n",
+              "f7tFFMQRIVaLiP+uig+B5OoeaCY5+GdEeHuY+ZE9jNToZ4yOUwNfysZaXJBrtfqEkQosI3EYRZQA\n",
+              "COu9BHjZjXsKjEmWe9Jj9yWusbXq4WMANyEJEPNSeDcqy2nLsc2OqSE4CgyCqy8blbRZqycUiZt/\n",
+              "3NpFflI5dk/7eeQ8Uo727U5FhceNm/3Tv/0N3CZNlPGV4f+3/HHJknpIjibzMw4AkTq3Lkxy1XZ+\n",
+              "FA9yAR3cZ0/eN1EscyudULe5dTvs1EvlYMWBAAABtgGflWpDfwB5Rz+lHWcxYALocP/IVGxKQ/5l\n",
+              "P8Y+UVeKYTw8iTn+GjVV8vbhgCZ5cI/70wvHdrfJYaZZyRIawh8+61+/vwo8HAkEyAQL0QVrU8Db\n",
+              "Z7+ORIRATWUQyS/LIyP8q4/O5rf7OuybqgrrJ5JQm3dvb5EYgnYLHCULt4xtpfvTsT5gEynxu9HL\n",
+              "Km20sO4q1oqcF4MPx2dj7xETa3veUfVJqfvwop/9NWsmPrdhY/wz7rinYt2HcWm7+ulSBZtWIRv3\n",
+              "yMRoNM+lyCvZDr0PaN2HfwYWOYr/NgyLM3qvI6TujkJkGWBIPuiFK/SHsSPx7iAMcrZ3CQvQC1rq\n",
+              "psLEx1Lx0vtWsdQAcjEYe6l7VHqUFbgcjcHAYPQIIgi8NauIxLhxUOQnkJo1mXO/e5w2N9AAHA22\n",
+              "RlXXsFU92TGe3GmYdLlI4OC3IklyabPhxs95veQzY6n0a2BnyANXxWrQG1vVVVAYgtb88NEdo6By\n",
+              "gCh1aEE1VpUTP0of4shaZpNk/2gd6T34r4uIClLqdADAAdaA4/epPc357p2Ro8OkrT9okATGaQDM\n",
+              "AYBiPC2kAQBkyn5ImAAAAdBBm5pJqEFsmUwII//+tSqAF4In0o7iUdIU6DQAMu59v/f4eNbK2my3\n",
+              "LFfU4bVvmOXvurgANJp+yhdNshfKZWyf1yiq02eNo25TtXkBg+c9UZquU5KtxkSr2wTyRJb5fWbg\n",
+              "+NL8Fosje7XYkSxYEiB3sVwPhHSvNWh2d4v6fN1lP9qvuUnfb1Bn+TdruqmJdM2vx9efbO5Th2CP\n",
+              "KiH3jeuRzoCzSIUG7cY38FVzT4nUIJdz+2KjjjJ0E7ZNKQ6lROaPqjFN4utrXaZfqGFX2nWmlL+h\n",
+              "PxS7plcEcSC1oWpbRWphWgodqD5c2VmFV0yO9NkxWYeDoEeaPVORAB/gqWAbIHdoZVHMBBV6fLyv\n",
+              "D3u5FppjGB4tzB+WC5jnXJKg0Sk3SkInESay6cwWUVJt/G4Tfg6wbMdEkCvCKlRosg/RTpp5P6wR\n",
+              "Z2iZfctuN2EQi36vtriULh4PVI/bw9ZXWlyhMpAYPlW3C1NvZrlJMNaSqGSSnh5cJMfrxHquXcAN\n",
+              "CTgojRhZ3tMe14Ny/HV3UfnpEJgrqxN8KZxlRpYS28Q96uqEu6NBBsBIIz0ei/Mg1x57c0aguL4j\n",
+              "dVBDXATm12Zi0uXfiRBRiIror0O2CDrlUQAAAPNBn7hFFSw7/wBgSQL3wIE2Tv5B6OJXPcoXMcSb\n",
+              "cE8qv/1v/uy5HaAJNUQCTSWlcVovOwe/GLZOdN2BNEgb1OlzNEinzyASzg3GuZ9zFeyJHe/zvxXW\n",
+              "qHgQlhmuH8QdE1M1s5tXy5mwAyoAiCrzupaN60ez6jWL/yRvGdGiPt3qJJLeMG60zAMKa7QhUJFJ\n",
+              "FMWUFrcLW6iQXx7VTZR7Qo0gz/aCe+BxT2h34J4bdpQTH59SHjOd2X4DMr2kpW5buE3EQBEKSUD8\n",
+              "yEiNy7MVRtsZHXt1V4Pb6TljTGXtC9pzGwEXtgadiRP8dhtDjxgpVN3IyoEAAAFOAZ/XdEN/AHkx\n",
+              "u7J3fsEfo6cXtbkNOd4swcOB3voAJyKHu0c0/MGiiYXv+2wca3XUwSOEG+s8df2rHPxj/J/Armyt\n",
+              "j86AAAWOWZsl8AgjGF9fWv1mQf9jrWNuA4APvfeLBFbZJZm7otp6Fc0DFqB0XCbEvLTkRU5ySc7e\n",
+              "Y4CD3ziWyxgWkLgxNxAV0V3rzOqUGhFxcTbBCJI75knYyulzgB9+SazwgLVSR2N8nND844Y7GLCN\n",
+              "0aeRWZgNIAWJkPPhP1VnSRo1jOpV+axgAXL8ExpNwIvLk+O8lekZ0/1o7sI+uJ46XyI2SuA6uJHd\n",
+              "bwUKNMI2qDKAM6f4kKlJLSQWqzXAi8hAQzI017i25Vpi5npQJ4TsJeyOHRvmO1wY5ZnIEZHyhgB4\n",
+              "IoLWrdA5opbAou9XxH6m1F6osqepeJLd97Dr7+5BqWzoHoOLhOxNwAAAAQ4Bn9lqQ38Ah1fDGltb\n",
+              "SoFNBABy4LNe514R+dnaDTYn5E46OmsRrJgYyAm1lSXdflAXI1+CFQXE0A4eKb0poyZSLaaXfRBJ\n",
+              "r/tA3jW8xYt/UxFDszVrqnPHP/Ny6pw3mJ+pwWr+YYAHxNaLyZj85nxRNPFMUkOr96iCB+MslYrg\n",
+              "cr/vUoZCrrFka9nw08yFJlyN4Ky9KHUYJOXDrBIiz8KQQaHFalCe3rENKk9raHLB9E2PdI37xydW\n",
+              "9R3Ktqa3KW5rMJCOoArO2/3trkkCh+/FDlbsei4VdbDQ32DjCaAkDFjCyuqOJNsi8nSI2KDSRFCB\n",
+              "83l81kCObhPemVMTlMBQzSDvOtDFUtuVwHtirD8AAAFqQZvcSahBbJlMFEwR//61KoAWweTusUEY\n",
+              "AFR7WLigAceU/KgvW9LBBRTRioW652v1Xpv5tYMFhkRmmlUca4/8lM9NJwOZFgbdLq3dhRjr1SQ+\n",
+              "iitgTnIKVe77qt/yWy3INzcVxffYfGucVy2ypyvLSUZVvVzu37Ufe4d1uKQAC1EE3Wwzkx7sEK4N\n",
+              "QwJyCdTZZnLiyrlEXcLAMbB36CvMtmCiaP8XPpa1U2RaJxnBB9qYeP0+JCORflaC8m/hyWfMppd0\n",
+              "XeCFuAYTEakC9vO4HVF02QH4GZZigg7j7bXnvstEtP5QgYZViZcOoAaQGKtWm3PCHoS8mKWfCUk8\n",
+              "ZLC6z2a10V0U2DavVH2m02W1Lc4/2WzrwUTHr66DOaP+urnPdabeHdXruv1HJ087InGSipJtxGko\n",
+              "4rppNbdlP4z6g2o/ksCKcSZ76uS1diKM/39wzVYDu1tkCD1lomve9NoQwUToKqCn30PDqMAAAAEr\n",
+              "AZ/7akN/AIdka2XuDkeawxOj/BZhZtP+kNbRABb4RmWT8vSOMSH2HVKuz5/n3pn38gQM6YQqY5bV\n",
+              "v8KsLMWKt//3BpX7BUiSjA/GsXEpiGachc2o+KqjjRfujy3SLc+TvzNfgePwT9w0Jj9Y8j6ORxA7\n",
+              "13x9/iM5Lx1s2OQQyRluiOYKxXDE9QjNulPCcMLJFKpvAfnZmzl0pzzHw/ANcBEDhABHQ9ftCkUs\n",
+              "Q4pQOQF20mJ1++bXoRcUz/lR79ACwohpzpGuaQCknCVhUL3lnnyQzloB0PAIRq1VnOd+y8D18t8/\n",
+              "IEva3L9FTrRi90eT/2pNxjMaqrOmFzrhjd2kmSd3YBlll+A3KrjDn/HtXx8SDjztM7Km7BEd2LVO\n",
+              "U1pVGn0+C8gCov9gxoEAAAIMQZvgSeEKUmUwII///rUqgBet471BV4xl2QAFRvb+6Uilj9hVaCt9\n",
+              "oXOXB19FM5G4bNDJAOl9w7HrxMOF2dPOUf977Rp9NoBObCR9cN42Ht77Y+l36qfp5SrWPFz3DG9k\n",
+              "Uks1s5yfRvMME5RxPYk9+qohbe5TR7z2WNWBJjaTvhnu4485WU3BaTyIbA4BRRdj0/JwsbCXRVZy\n",
+              "OMmFdXnFdxhNGZ5JMCQy+ip435WTv8KevLzG3OUTxX5d8x0gaiQZdaPwNC9GVrgmtqTc0z7He5Hx\n",
+              "p/UnXiE+WgHU095CwXga4AbeOtQbj0tjxKUoS9sAoJ5fyTlHv9FnU0ujgUuoA3Kj0ma5qF69zgnv\n",
+              "MTXEIqf8zuYuInk435YB6s5Aa1W77q49/ZLR70JdKU9F42nWnuaGIFvaX8JNp0NTGvA0s1VSOWIl\n",
+              "YVdpY6hSPbDqLYXO/LE7X1D3sWpexh+/kcA2B6pYDzx14bD7OD1f9pMDWxIrW6BpNH75M54gOMY1\n",
+              "SxoTsfh6KVoyFK4Yqd6lPKCLY4O17tm0vzqLEva8zNeuM7b2yHKwMHpqK8FV5yaEer9Zd+uSgIqd\n",
+              "eftECExc0GDPrda1mDLPyRR8iDjZRvRS/EElnceTaWiUEonB934ThxItQqnJINdKSyNdNwx44Jgq\n",
+              "H9/Zh55FLA3sdVDr+1aesKMfNmYnbwaje7GN0y0AAAENQZ4eRTRMO/8AYEUc98FD5/CYkGD6VZTK\n",
+              "7qaMD8JeD5Yvz1s+LaCSFWcn3aLtkXWLu76WBTjEp2boTz2lISGgYIiIhTqGBdSAvn4GaApcqQ2+\n",
+              "sy0LjwIg9aZXDdjP9AWFTV1H8wY3dWCf+Rn8X8p7dsAFRxXZ4015PG0t6STtIq5DOqARSPJ32oCq\n",
+              "OenP2L2rQhT0bU7kBXZqDOvuedMFko4K8dbR3EOKtstAjt1gHGNubjQIVeNhJsdrdMtXEY7juX3P\n",
+              "NuPteAILXrR8S3R5mIOtuZ+vWEUdS+Inr7FnZsbQiIv9i7KDzU2m3LJLNdjmArFBBLgFXYHDvQmL\n",
+              "9VT51Mb8gx1TyNar/CPWDggAAADyAZ49dEN/AInJdfYNr4ilmYSAMFB4GADpypoeWWXE3q20mGL8\n",
+              "wfGmH6ZgcbtTXJWZn5/uB2IPeQFG/rqNYZ/bmIUcKhccFRuPa9wOgu4Qnm9oi81y+ChWQK1KoKDK\n",
+              "TWWDeg/SDhV8w/q9dFY0rcekgnjPKbKFgzK+IO7hoMF7vhpMoVCqvwMtBaesBfF4bzxIufyftMba\n",
+              "VRaJWuZpM22/FtH8FxujQ6EjGNr9PHZg3rsxXbkYHRqZvH6RGypNdfKRL4serPMKtCeuCWEKaj1Z\n",
+              "h+pr+ULdNvwpLLHfA3OCu3Ql8v/sLDD/O1LVB9ug+l/wHpAAAAGVAZ4/akN/AInJdjcgUcZACEqh\n",
+              "GvWiTtr19IbQdv8WE1dBOa+lNipi00vM+C9W8F7IDH0aaS+KKFaekfOwUNG520lVemVKNYbjnPl7\n",
+              "LimE+s4N2NJ5SYT5+XRMb+vTvKCkG/By5wQO/WbZo9HorEm10+Tu4CVIj+2Ky5hDZl+kA6mkBK7E\n",
+              "3LwAW+4rGYiO9JH1BLFQj0ZOJq0ybrdVynOYOw8TudsCI+I3fiT5nmYCkIO1N7h++s67fASBLfgP\n",
+              "CYo7yLNwfifRM3ay+JhoRmwX5tGJ8l9w676Zo1wDaqZ0Q5guAYSxSJk2jHShR6LxlZmIVJnq7S00\n",
+              "iBOM0mxomzMhjpxeX6zqy/aA2SEREi4ulxZsEvlIWhLQ5YFv6LMkVEh9RITRQOsKGEls7Y4eSRWc\n",
+              "f23FGWOVxL2MZUmPGVh++Xygx19XCiXwoatt/s2T7zGfLkQ2IBiMKXoeDb7yiR4q+0v6UjACWT2H\n",
+              "kOIRMpG/B4KQPsfMRT0Rk3cAwV9dNnKm4XTlo9P9TmyT71B/Greq+KvhEBDxAAACJkGaJEmoQWiZ\n",
+              "TAgj//61KoAW5ktFwTkgtAAhBassVgP2a7WSOTniW7GlpUC5YARIimzpboyDKn/53KIxVBS+A0NS\n",
+              "3NuuWMzq53zfHvhoSdYO4dYooBUDN2VkLpVK3v3kQo1FoE02X3cyV2j6ziOTJORgWGzqU5k0XKJO\n",
+              "1VCPDS1gJclQYem5NlGAENmSiR9I8XvNQLGvpLGF/2+aU31xCZzIPp4tUxyLu/gVqq+6L5DezfDz\n",
+              "gPP3+vv4JFttE5Nyc7LysmCaQfUhi6zPymHmdLjs3bZdma4hV61UMMsGBNZfYf2GUkV1dVZ9kkfz\n",
+              "RyUYJPFdwjA5S++T8sc03o81MYXnXYkO9hGiG6RRLRRV2fPSgGhghnaqxRhYVQiuVS0ENIpjxqqc\n",
+              "KBEaAMs1VoaLKEOrNhZ8yB1VLLV9KSiM7/prkkNKRuNLp0WeTv2eHtXhIdAfhKb+ic7Pb48CqpOl\n",
+              "FnnbgphlxDaS1dplrA4VxMNzEL/27xNMQzhuRvnSDNb60j/kSJHw5x2JG6G/VwCoVAfFrZll45AB\n",
+              "Puajv4y9+7flMd/pR8Rg9UAn+cey+vNCcCbbn7FNSWq2hl9cymk4fwW6iqBgiFEQ7YZtyDoNCyYz\n",
+              "KAnW0gvHCg+5n6+qxC+xDS291Y4JfSW927ZZudU0tXxvupwcKf6fDXxz/bqsOMvxj6Y81+e6Dezh\n",
+              "B2/8nCpk1Qc7N5s0JoStEQ8+K2ir0vIXayhFQIgAAAEeQZ5CRREsO/8AZTZTJbuKD3PiQhYpzA/Q\n",
+              "3Iqsld8XUz3sHppFsAHZevvXPBLN2cIUd+YCbEEH6MplVFEcbuDDV0dnlBcrCNrbp3+CAOdBsr6h\n",
+              "0YfLGDPxHlFlUCi4qTS1o0TT2Jzkq8/O+TU7SSImG1EjEmOGpKvxjn7KxERq2Pbd/0y1sNHk5hiQ\n",
+              "eJwHwc7Z19aIrWes4h3UYQqHeU6kfCpUHVgnGubU2A0Xjg0UrouNSumFogz0StLk4fuhL5slF3Bb\n",
+              "3NpP7YhgiVLV0FNM21/pfbXvRQFzmliOaZuScgePqa02nvOdEHEpGVRPLCGL/tvzSkZqhXResmQg\n",
+              "1qZ/TxlvqjWYqPRThBIk2nP66jbd6NLagdWz1BtbrwB3TQAAAVkBnmF0Q38Ajz7dDL7wKLyRAA5r\n",
+              "u/5Co2KbB/AnQg3XvWeaImUuto8KuobiZ5Rpi0jf/+r5lFprj/mYxpQ5OwqjQqFG0eXwqi1D6M23\n",
+              "HLH/3LvgYXkbAAGr9uWkQaEU+TeJ38WNXodDC29t8Y0uYEpwNzyC6FqtgkCyDYDpd/nESpdVRRJh\n",
+              "15SV0TP88AKwZsT7yWH2r5gpJv8AhXnnWmKJ/WMwiS/2+Kf3ikj614P+BDohXhMYGO4GSZ19EkRI\n",
+              "RjwO1zoy3Umd4iOMuBBPzevAs74sU7IUdkUF24rNAstoyqnAUgY510L3SgPXbZmJYMv+tRpT7ZuM\n",
+              "oLxE5ACIQ+eHStmGZgh2P1nvrIaZRiBxoWZ1B+DDOtu5OZpc7LbajGP/oy8HbEFyJIcGXHGB5VXY\n",
+              "HnskMmabuu5xyFIJcVaqbGg3TlqrbBE29OX6xO7K38oavU/okVlIM+AAAAGEAZ5jakN/AIdXv9ZL\n",
+              "/wCpeCQF0zyG8897iu+TVNq8xXl3pE8eXm424VBKoADmOQ/RgBgC6Y0IzpqUKPVKwCZafdEIuhUv\n",
+              "zhgtxewRpr3F4VdMy9NUqqvPfGroLPxDW64Af18RtCEv8t7amX9ezvEWK8AgZjHjHXeVi2k8dp4r\n",
+              "TuMjdngEOGe6y0V0qXE0vJudyGSblaiStnW6rV0e34JxbdN3Qbajy6ozlLfOkq7Wqx1iLXxa4foY\n",
+              "IPBIjzxdye8gOjZW7bP0axd+wppVHkXrrvuxUf9dp18AanJIIFv6MCm6ujRO2wyu4ZfSbZp/KVFm\n",
+              "xvxpBAJyjKSdCoPxWylEDyms9NAmwAADmUiy6WUOIsiAC130X9MRKfeLHi3miJh/YDGeINuX+P+e\n",
+              "NWBXxp3RqAzo1eISPcPztmgXUHCSN2VRpnCOFQoF4yyryK4v7s2U4a7V5e2sVJBhb7kguiVFACK3\n",
+              "rbLSCnWI4OCs6u017nghnGW3Juq0rF80iqmo5QCt19S62wAAAkZBmmhJqEFsmUwII//+tSqAFu/w\n",
+              "HjJpMYeKfGxaFh4NwH9VzFzipiNnWLhZf3lim8qQP0NcWviT9hCfSjxxrnYEE59yPQn7u6+tCr/u\n",
+              "vn8/iyWB73TxWIDTyqwOWzo0R8Wj7McP4QWP8yE0svd//Wkug5+3cHmcpP/ONbeBn+TAQ0VzErlc\n",
+              "2hXFLnmGW7EB004qvGi/S7JfG21T+V5Sx9Nre0PuomioWltV0uJSYiMg18UwZktQhoyeO+qpPgky\n",
+              "U9/xX6NUrUyAfCz03v4wSV58lpzV7BxftApX8ZGWBx2zWQV/YeOCEWbmbHqvN18Jd5FxK1iHRqe+\n",
+              "nBGg6SyBQEQQfCMxCo37AXM212ulRN9X2fE3P9HkhvkaOxQZ5AElyFJ4BlaM9J8bcUgOX6NS6Cqb\n",
+              "n7IHMcCIPjAIJ36atWVr0EheDYyrwatT/sRxqfSoF0RgoVqtGqstMXZF7XACu2N9LDV5Ss0B+mSl\n",
+              "kJJqGxc50wazbtpofP341QOLrRCoQigLO2IFkJyqTpln4FgoWIMbx8x6cKkFmIESXv7mZEx6LOrL\n",
+              "ggZa/EdzllkBPCO/+zBjmey1Y55MrbMpoidNDpdQ6yZ4UDU0ai3HtghNjtrUaVDC+dCrSCASLB02\n",
+              "bO819PX27qwUTWW1MCrVhUzQkUkht4Xa4bdnUW7zTudPa++EPxUMVY36vPDJoCGilCgIXzTOV6S9\n",
+              "OVTh4+OA6S/XkcoA6ZjbQLERX5kZSQMoFJs4bPot93titzpDSKAhc1QMx6eKK6Ol2IEAAAEkQZ6G\n",
+              "RRUsO/8AZUEFdKFRxHYcrgnLV1IJewAc5dAL6/Pr5YWcZb4ejev9b/lpY1ea5Xk1AlTe44c3rPkF\n",
+              "DXI6yAdEC7kxPh5StAse03AARSF2nro+Dr5bfPJyYF/ERJ9NScPmUIVihvTCsyh5qmuoAH9P7eCu\n",
+              "Y8rdH1hF/pTSa+Z1tzZc8gwGtgV/YsMtlWLs3VbLWxt2KTDW5Y2b0HA6zgNn25rXu72r6iiN5aw7\n",
+              "sjFipq/8rjgHE9K0EK2Opn+0SPK2Rbo28aoNdC9V8VxW1CpMNxKjFOs8YmQmJE6Qtkw+Uo5mh3ic\n",
+              "7Ng6Xje5wAF7a8Iyr8DMIwvMZnnVp6ilQ1B/LSGEPncviRIHH8w83Grtt0CsL1L2isuyMboY11N9\n",
+              "lxQPpwAAAUABnqV0Q38Aiz6zZgMl5b2XXQAXQ9yHCqNv7FVD9CxHdTnw5pqRTLAoFiba5ss3lqXG\n",
+              "QCf4/o32jzmzNKjZDN2ghdo3OS7n/NFKTMs4yX0NTqaEhdnVRvrbcGvcKo0NYMgzE8UNwneueU22\n",
+              "1vpuKbOkae4P82iS9XSi8TlOPcF8mmD+n9qfVTXzL4r0M/s5xxZempvnxqhz38EgmSM/Zw7kEyiv\n",
+              "giyuP/YjNhFl3FVcOSLiQTCj+F0nLUE7lia+UkuO/YNBXwUKZKD8Add8BG6ZTC4bD/RSktc7uv8w\n",
+              "NB82AXgnpuELTB2xZFOLAYJncjo03/3uAK678Cl8cw8fzlbnSpp5eUkHacCUtAY9LPrz/OMf2bA9\n",
+              "vBE2eUwrxz/W0Sg0tjzkUrpnJSF+xYsA2fgRolT6A0NA++mVN8PJVhaGzQAAAX4BnqdqQ38Aj1eg\n",
+              "HO2BrhbSJp3bjAA7Lyx/X3Tt1hQ2T/wP93u+Km2fQtCsS47kHT/v6cxSu0EEWzwOVr17m7uMIt8s\n",
+              "rOS2NL0s+wNbNsQiUhFGWcubxLdtukca9QFTdaQjRXuW15l7gz2QnuVPe/r9SLMinrQ8TAT7c4JB\n",
+              "GrUpwbYY2wvPKUw4NOIKdjGz2TGxM02Yhqm+YQD7nu+MPeXg/5dBf+XeKfPK+RchTbfnRfx28pUm\n",
+              "+MUq+ynmpWVmmfO3TbD8gZCbZRUeK4LOH5lP3nvVvkbZlQVhN5vPlxxNouZsDfsmprxmWrHzH3vb\n",
+              "E+c7VsDA88L9wCH+ZmQGzxFjyOQ8cz4P9rsZSuU8vQS1h6fmk4XXUosrmweEGKJT/Sv5qb0OG8e9\n",
+              "voRxFaPrroiqkALWSnA5n4zcQMwfY/xXX1aR5rslt9ItB406qJIsbsrkl8pXUe2CwOVm9B72bhd1\n",
+              "lqsCRNktqyPMF/Ek4JsxscPvDjbSqbQZL+uT8zjgAAAB5EGarEmoQWyZTAgj//61KoAZQB+OVG5p\n",
+              "SZHABUb2//v8PGtlbWZ+A0oGGFPTAdgmU2TFbsuJ6mwUCouNe8f1I2ythN04JSJ5lx+ik6KpnC91\n",
+              "1FD3eD5Jit+kJIg5holbnldcijL50GRMV+Tt0L65TPBxqSAUdrQu+eLUTHPpJCL4CV5RJau8pEIv\n",
+              "uK3a7QA/UMQ/nrDjeZ6jqf1BF3JjbyaeIc5drvnYbR6lQ0gBIzp/QRU9xrHm8FESnIe42aooWDJ9\n",
+              "bVMccs59QBQd45WisW0MXV7NFtyepgfK7biPJN57MDsWL2A4LYHAXH6f6In3GVsSrYQ2HUKGlxpv\n",
+              "Yf/Xvk0pBnHsuIEsslXTjxwTTzuRb2YT7QCJp6yHiUVL67n8RfvHMNoHfUzP4rVgPSXcPL8FOP2d\n",
+              "F8GxovHNOmsOSUyc+t9OZXQFF+4FJNSN23FsgARohBEJ3c1u0ax3ACLYlwfCd3/U1mT29ftZkWMR\n",
+              "uj01t9v2AGHvgKM29X2Vs/ALzLNDd2OM9z+AC4TlcpgcRujIhnjHf17Je/8RMBqJCZtdfrFmz6AW\n",
+              "Z/aNIv/p/WX6adpvStFWxoDAnf+Tai9COS20TO4GHDviQkpMo6tbNTk4tiYWsmvBNq5u/aO08r2y\n",
+              "Bs1eH2kAAAD6QZ7KRRUsO/8AZUj9pUTz7rNMoHjJ4gSsLw2wABNFEVCVBZ8at73oa3C8UmeDMVba\n",
+              "M3uHP8p2EFDXTkl9EiChbxZZgpuvefKfc50lYhoTJ/7H62X0Z9NX2I7S32WT1XJeJtD32zfVBu3K\n",
+              "VmE+30x6+W2pKnyMM0ZejDKLq8WyIyi+9rC0QVVyU0N739nDCyt6aqRfMfSdljqTnwOmgDB5pHyK\n",
+              "U8Nf/BZxnIET5uBVX/VcS4bjmT9sCYYwmAz5vBy8cv5J53FYPh0/wF7kP2myhm8SfTnmNtpTej0y\n",
+              "JjLbrdGSBUAu+lwbCsr/YdOCYrxvvrklZP4j4s5VlQAAAgYBnul0Q38Aiz6zZf6skuDOogA4jl3V\n",
+              "YKO0NncAuqtob34dJ/eVmQtCFk2jxP+6gBUwoAJ5d6wKEpypNd+AlIf83kNIAAC8trXyGAv3zzzV\n",
+              "tAa7kzCHOXS39Rxic+qZEHcHH0Hx0iIZnH1UNeoS6dQYQqolDkQpOXG8nP6tDCpAEYSQsJzo5kch\n",
+              "Xf9jICMUCBjMQXeVS1i3FdA07mrKCBowVzEdee9WvqvXV7KuMTufiL0hA8BHvtD6VFvEZ6eiqgvN\n",
+              "8RNM5cYXQ2i+4Lx4R2QlAIN1NNxqM8GvSjSh/rgipqY8DwHJh8p9Jbu0Zs+w86pgxJN8m/cvWxRZ\n",
+              "yFAtI7sBhDbJnNXx83ll0o93YVJhxi0TxWXPf6PlHZeEyvr6QOF2VVafQjsZUg34P/p6tj3lkAer\n",
+              "aZouLIrbfbTrpoGdtXuXR2qC418s780GZsUBVTlvppC7dgGYqQzB5daoV61BoiIg6tQyG20Yk/Ib\n",
+              "TtwSJmeU5Eiu/zRo0bpbU2jgV79WVCB/SVzxsmoD1jJEhzN1FHxsbajOijl9Vp76GofsezNr+37n\n",
+              "UWWhPPzCk1rCLQgaI34ekcMUWq/vBK2WDe7wKACe/5M5UglN5Ct9Orsd3SfYPc0336usW56marFA\n",
+              "xW2XgVLc1GludnoFyQrT+oASHSl68jJc1j3I4WTIeU/p+eW8RtUF4AAAAR4BnutqQ38Ai1egJmdK\n",
+              "YqnGBlYUAF9obzNVJ+s4Wyt0Rq0YuZmzKSClvCu/741bUzMW9+2RqBxHf8xROd9WCD2DFO6m3iiG\n",
+              "ZOgLMC6WQsGlrWDKBATBQkW8M70y/ztO1ZzNQj1ow5FREW75+T8qWeYnaEkP0sDPfhS/8A++EHpT\n",
+              "ONUZpoNHugOpCj8EFvE/MnQhkWbqDB+V4zYJeD+V1h9PGTTPeM5Ykyq4ZMi+8E5Gka9dd2CFXMaQ\n",
+              "M99mRo+FOH0+y87A4U4JusoMgrnGwBHn7tNdR1Jgk+wKYqmIwBj2jGPnQFJXhHhE3ZkpIjaeakM2\n",
+              "8MH5c8xC359KRjK1nfiZHGSkxS98YPps7lGGiAJ2WdM/l0XaVpItX1VPHy/wAAACGUGa8EmoQWyZ\n",
+              "TAgj//61KoAWNzc2A41R+LAApun++OIZUz7EikV/szjfxvYPLx+f9K2/F/he8DHawkBMdV2wRLxA\n",
+              "t50GIuRUSWE/39Xo4nAQqkjDTJdufKMgNIx0erMAcY2QA5ejjVo1tlzncJOxCqGpuGwA+5/4IKyu\n",
+              "bmTzdPecTw0ZdpVPq5j/sb/uUTmyS5oriK2QJUn4uMhurpWU0pM90BFHxmx/55iJQnC/E4AiRjGv\n",
+              "TSfvy9eol7L6q3/AmWDGKQmta5h6TQecJSS7keMMTmFMkcgh+dQEUTFbphGIZpTz6vxfkWPPyqpQ\n",
+              "VmS0gectGBeLssajkGiu1ivhXeMUvGnpqjpc6XSD8FJ8sVdfwdsse9JozsVq/t5YFq5+AnEYcopl\n",
+              "mlIiLVwif6/glDa/FvPVZyUrYuYY9L3TA7eEHe1IcHWSOPxpnafEFBrVGoeZPrbfymiVcHOQ/3CX\n",
+              "aGrpVwdWrmOHr8jLuajUxWOW37ajHobcyT1hYWMxRTx80fZmsfvsrNw/Nztdx7LidHGE8jPZ4gQZ\n",
+              "DABlByR/bof6mTmjqkfbsR1PCXy4RDNnn9nCnaSnb8pCApsF6YsDTv0+UmVzx2ZPSdm2LhZIqOim\n",
+              "mhiXHWt+ZE1dnYkLwTdsgNYEeAUTjY5XG25CAykSMfKGwGWeeOwqKmLAqTmb7mCXXxxpy4+bbELo\n",
+              "RAxOLFOR7z+Rlt4VIVMH4QAAASRBnw5FFSw7/wBiyP2mEJvZyVx6ACpM7CM8ZBKHKR5j7ndOem+L\n",
+              "X5lQTliSlHrc19blDxI+BarmPxVVRFr/CorqLGvI+vHNUfF9L5rOth1seL+LchCRD6bYXJMlctoQ\n",
+              "KBnrSfN8OsFA3rCX0rxhgXIKgdEDuCNRYd4XCiw0AyO8VPwgQ3UKQOwN4T9AdwOVZht3xWSjlGSY\n",
+              "LTfR+DOcni9vpFUI/V99yTFNeriW/Ezi0Mmb4Xp+UrrTAn+/oqePQryHATZ97i1I4TzdZJ6ol421\n",
+              "ZZiGDIa6I2z+mz36WJISXYfn5PcaqZon5evy7wkHdXdLSXQuyy6RoW3UMK1kv4eYGMx6MEUBV881\n",
+              "1DxJ4Az2tfQhJ60iq3lK6xGARpoGTWiGA3pBAAABAwGfLXRDfwCHPtdry+v+2nyY2Sk+gF5YW5HN\n",
+              "XoAL6QRR4alJgXnPRJGLu1H/XzBsCOVwj2OHZ7/Befz18ioG7PdTUWTo/DFmzXwFwKSHq5MESJ/K\n",
+              "+czoaBaMU0SilMUvvgF9NaNkzEcYOJjCpUUkl+lvc9iWY7aNcNT0YkO2YuPLl1ZJa6XpXyzgvJfC\n",
+              "YABMMMlHP4hWdgac8C4JyYJle4OEiXwhanMhhDIkpZpmZqqPP6iXGzuSTb+0ZDMJHqoDGqJmkb8S\n",
+              "IJuvyZGNE4panvJTPVd9f7g4/aXxMPm3Cn3wfT3mTthI056NzanOEWKjM1qGy4olpTOi0cV3zUKu\n",
+              "VGl1k7sAAAHXAZ8vakN/AInJcXImIY9AsY+/nZAB2XUf7nMR8KlDfCSlxubwbY5yyAvaK6FdhjtI\n",
+              "iTEMX/gD5nqi6yBjPV+WgerMVdQiwmsTWCh4ZDRMTEvRNiTK06p6H4BM93iWfwAaKh8Gz9Gaukwy\n",
+              "InHLEZ0yD1XqM2twrrM9K/zMIWUOeN0Z6Qpdges4mCaPjYBUMA0KTxEuHmES85gUYlt0s0Ks9Nu+\n",
+              "2hfyb2t0rmyvRs70WgBBgYrdeTZMCwmoCbRHPK4oxsSlCang/p1gu/DmbjnwYRln/v7ufz7R3gdP\n",
+              "Fr7XrHKEZc+f98DBxQMF82PBbmDGtLAQXHwptz6g5mqHfaJhvvgj78jkqTGrQ4WXMBaKzHGNvGYe\n",
+              "XIR0bHtcMMQd0uz0UHs+NS8bhlZ93PGBn0DI4S7X4qFOiND2PCIg5ogjbfFqU4Kuh5oLH4L3vi2E\n",
+              "bzWP7DaofhwjMqjCqAvZAgznNJDsvnJzQxJ6Pqjj2ny04t1drdQRUisSLN+PcLenLQZbe401Xg2H\n",
+              "yhW845ouHrITGSqb9EOEeoN97gj42PjsdYRMVLRDVvCV2BOAqdLbEmICPHZnyy75qPsejK7duPuc\n",
+              "fJ9rEnjynB/HxYz7zf/RM6xyYbzIoc3AAAACEkGbNEmoQWyZTAgj//61KoAbj1lLPyvb6PAZgAh9\n",
+              "7f/9/gX2SHKs8Uq31kdycpXc3bf6XPCYn1E4Nyshm7SbxYTXwR3t77AgzFtBuE6fBgZeY48yXmAW\n",
+              "rqOr3iMlgArjVOjemrjz47grY/T9rKmhvhaqPi8pvZTzkzZCl+tV6nzXVbBFw15yZW9xk2z611V7\n",
+              "GITjv5GH4Oi/06B5IbjEMVKEcRpvt893HwIyUBXniM9I90uh0TBxOedvsxxE2iLZsr/m/GNXryb+\n",
+              "9as6btju6GU5FfXHAHKy97PxI2Rac5Rx/FoPiuKEecRx7EQrDfRmlggPPP63oMY4jkBeTzC7Drwp\n",
+              "8ik2Z4rhoAMWlcRPfXCI56oe4Jt09oRInuaD3ww9/jGDjhHIXGbNYM/s5UG1XuYLCqaLxESIyPG/\n",
+              "eNnETthXX/QZDvDCFX3YINANkqDvHlUQ+vcUvksaWF/g1aVcMu45c8BoP1coWBAVWVE6iyDMwfYl\n",
+              "RYTcnNfp26mpOfqiSJnYH+AFj0qGJttgeZBuJCzdV4F5EDreo0WWAiq/0jdXljJ+ZxDij/UazQOM\n",
+              "0ct15Q7rTOqLKy+lpOVa/koSWj06e8eyy0wY1FBSVaROGYbDgXze1QzYiVyP6+WTk1fjz+Do+J+/\n",
+              "TxVlHJsfUOz0tbPJ3R4cSjRVigTxPg9VAYynpzzMlIr0/pCOGd4XYyl3SGTwAAABOUGfUkUVLDv/\n",
+              "AGU2ltMhgssRVFnYDYHdfwUIOpARUIP1pWfDHpU2pf97OTOpyP7SrW+j72yMHgCy10/KQJvVenOE\n",
+              "eMrSHUfyq6lVIsdEDgl0M+/NXx5VMpg+IZB+I7xozsY2f0ARjiAjA8ZSqG32YEqaGwpGp+vfKL3P\n",
+              "hav1CfnyaUmopPCa0Y5ww/PZN4YINPOwE+Gg36kaKP/ME/B0d8v00CzvLXmI8pIa3TqrGIa7PF4X\n",
+              "8miGO6oXkRH45ag0gFdgkGj+BD1PvtIptIkuqTa5jzG/NewDN9cCfws/hjc474K6NoCTyr++7Tth\n",
+              "LSIM60DcVje0csuhEMwOmCNob99l/AJp/9hMVsVsEaxUNsWBZFMKnZoLJU/ljkNlTtF1zcUwJoZD\n",
+              "oLTT6FmWVzlFnyfjiJdVIqMAAYsAAAIPAZ9xdEN/AI8+s1VkrBucudR5tN1L4cUDsugAOgW+6weD\n",
+              "VD4WeLhja/JOA5FtORnuW7CfHWfWrXcPJlwit0rQdaNL8wYmpMOBxVMKErdopYTnWfb0EZST9ZFP\n",
+              "kGeAI5wBNyE7pmk7U/hz6/Uncd5yONsvInzdtLdlFGIUuwPsZsiC4nxcPKJ4ER73zqMcPC62dMwB\n",
+              "YeP2JTSzcWxmsY8AuUeSUMff3wugzCWo2dZWIqj8MEevc9dnI6e4RX4rfqOmeKfJ7QFxuPllAOzz\n",
+              "FkyERujhdmr2mdRExctZgI01tg+iF/NwBCqP+hQ0BZaq12BgDPwBcWyuj8PXGo/75aroqbic3atK\n",
+              "78lcQoP6TccBH3q4TpJbdFKZCXZFrS7Hh71ZQxzuADlZ8DDRzGHyvFJs8+7LX0Z3SVEeli/7hzNR\n",
+              "3en2BovQV52x/rwTox00ojUHS89/I6QK5rr9xZ5z1Evdog7ewBETCofR8FQPxE+2X576ofb9SYpa\n",
+              "RU+FFWJ4WPQBj/u1ljXdmoINHOgs90YcpGG37DHSgRaxKh3h9samVWdsr/7ZPH7Krx9nfE8zJoXc\n",
+              "5Frf0sUOO22BhUTf6MatKarbA54SuNAmIi3ejRZKQJ4XCjhpsLBrmw33yy9Nk6OT0LCi0ELysL29\n",
+              "OvbOK/J+/iRz4bP6v+/3ppYXG9MzSEeggmS96wm6yOsevJy9wrAAAAHWAZ9zakN/AIdXwVSZADwX\n",
+              "ZeAC6HD/yFRsSkP+ZT/GPlFXimE8PIk5/ho1VfL2NNL2pqViOd6YYnwc7ksNMs5IkNYQ+fdC2XMm\n",
+              "GpZcBQdS+anJcAkZpOHFxqdIo1pLhI3h3bcsWXXBd+BTXZhbA2JSmhm8EWBGqSBNaO0U3Qcdcea5\n",
+              "428f3xthr08dSK0oFN+HNErgBuKfL3JZNShDHaW66u0MaG1B/cF2Go8z1F6LGKUAmsy0D/C2CM25\n",
+              "q38c827dgYTnZjZnTFxlPuxm+JuWvYpOeWyy3J/wjV/USVL+4BKz61/Ccy+EH/JkQUqRmUOtvYei\n",
+              "XxTdexyug9nI6kyTGc2H3hy0C3uFxKKFKo9PfiwDCQWhQ1+vZIsII4FYexn+pQbkz5kmdlWKB5Lx\n",
+              "ONpNVggWvIuTYEFI34NTLTOf285YYkebB68ywIJ5f1uX/OXMZ5RxH3gjNZ8mKLNX9suvs06qOt/Q\n",
+              "e2ZfZ7Orgt/l3O7GLxwWvzugIsO88I1KhpZhgYDdYZ//1lVBcwG/tKVYjF1obqjtyFctY9LPGIag\n",
+              "318ehZmIvkhW9djj90e+pnWknudbQDv3Os17s3l7qFADdqSGqYyGaSU47a6O12HCRSwmepV1bewA\n",
+              "AAIrQZt4SahBbJlMCCH//qpVAC8LE+AX+ndLRI9AAL65x3/f4eNbK2tvWi3seP5qm31GHdf4edmk\n",
+              "0/ZKv9BuxjUGH/qoYxXDUlaWZFHb65x0lomfbckqRBtklU+1LGTmYtvnPAbKnUSAh/jTBATZpFND\n",
+              "l6V6ofQ5PTBcFjOWwgI6YqalXUkmqnN6g77O4xvodhM7XQWhsA44ADmvatn61wvReF9d9MqoCN9N\n",
+              "Twpkx2kbbrSoHJrSyqidCsv+e2gnLoWDEdLGn/42++dseweQBj40iKRQ7paDrpDRwTZVjGQJ+52c\n",
+              "gaUSUp5A/cAn4FgESmp/sZ0NpfD9/7ZAmCbSUfPUar6ndxZ3XG2DXWcNFu473rzFQZNpJnXg/Pfh\n",
+              "QCQDuu/iX2Vi2NjGs1QVI3BReUxvD8Z/YeLy6w0jDh9dcJGJdKoNjb9Epdy5r0lFeFb9L8AWhdEd\n",
+              "sGreMPdTiMRlq+JOqjdogseyQTcuDo5iesxIsb0dhY+P9VqSJtTxyPO42dn6TXPZDgt1vROlp+Ic\n",
+              "VTutbib7FY5U+jSckVQsLzLRwDuIoa+HpEcHjzuwHMaHrKVljgiPeRI3Afdpqx3nHgy0MFCOhGEr\n",
+              "Jkw+Dadh5qrWjCGOX2K5HPLV0E5qw7krTDhpWX8sTsYsIqvxr/V2EjIFiKwnheBvunmhlbHNUKTl\n",
+              "ykWRC9Afa8QE+vO8sLJHYNqVh5kOrsn0+NP1Mm4JPbYiahSDJa4o8TJzkXFBAAABAkGflkUVLDv/\n",
+              "AGBJAvfAgTZO/kHo4lc9yaSVZkgaxkXEQAgySaAqoJy8U1XmJXFaLzsHv4KqZnckX0gP1AYFUr5X\n",
+              "3Zof5zltHp7OQG87KhkyMuJLOz4diYjf3ctsH2KA3/S29L1hP4qjZ9kfgNEsjrH/nSlX3ikiiFcQ\n",
+              "/2mu5vwlzQMTIUj5/0pAslvbULpI2rwxcgfjtpeW3qe/Q0sCZXyJ3L7VhEaeyKZo/ALUAi114xdn\n",
+              "Gao6fyKpZhWohGCsI53i8XO3Y7Dq+aD4ONx4A265BL770fTZiNNw+oM7dwTK1vcPMdOTVjz4fi6j\n",
+              "bCMBPzMCGM7CsAz7OQTIKiUTlOi8YAAAAakBn7V0Q38AeTG7snd+wR+ioRwfka+slSBm7w4HiigA\n",
+              "mYoe7RzT8waKJhe/5/xyHdk2lI4Qb6yur2vWdYx/k/gVzZWx+dAAALHLM2W5kE06MD+/WY8W9vMg\n",
+              "jgsWx+NCob+sUo3r0m3kC7Z6vE5pa/kp8NVK1XizBU/gSaY6/S/NP+nzZeAUHhvnb6LPnQnTmhI7\n",
+              "+CLAa1UiK6P+lwPbKP0S0Q5RWiopmhls/AKTmwxXB+WRWyrrFglLMCCi/H7yBlZCPn3f1nUi1WXW\n",
+              "txmtCNftDVTPLfu3fbw+YSszpG0LQoe/d+Hn14JtNEXcVveVKgdRtrJ2SZSzkDZoD5uTokEopKbG\n",
+              "geSmsxJSe6mDenK/tstnSjFiozTKWgyJb1mTK9iBWStV+uPeceDypkgatRgkwgz17Zgn457UL8xo\n",
+              "RIb3Rzvhn1PaM6KKHv4wQMqvpqRXKRm+SScKgBhgUzc706tHx+sk3QXrFbfmTj3VwEqpASdMV8SQ\n",
+              "Rc7Pl7VdiwexHM38nPcgZguGyvH4NF1CZay1mT9d+wee9MfU3VHZJgMp057sUGFJIJZNmQAAASYB\n",
+              "n7dqQ38Ah1fDGltbSoFNBABy4LNfpqaOuQiA03rsvInHR01iNZMDGQE2sq9jRvjWYcCsjv8TgHDx\n",
+              "TelM9UgK8aIkbW5xZBO7YH31DMzHB/HcoCKmBUni45/7i/CIo8gF1pGPr0DAA7wV6D09MIgWLTIz\n",
+              "u2RlgzWHXLOhQSqpesq6gEgghz4eO+szzJWiaji2cgnbFYV7gS1iXMpBIisJc8i3U9gywhFgtGxt\n",
+              "IPW/7TiYEwGOLwxyjZX1HkROuSI8lAAdZBpungwbYVpPKSngzu3PnOIcBqes7c29MHD8jRPn7Zrt\n",
+              "720E/jZ4jB2yT62h5AEs+TCYeJmiY6lwGwXm58hIVqeMFafCwAYhd3vDCtfE6mymrvYwtLYQ0YeE\n",
+              "Ebj2MbA5+zEAAAFwQZu6SahBbJlMFEwR//61KoAWx89GABUe1i4OfaowcQHQyqHCv9PnwkHOB5jh\n",
+              "ZaY1nqaJvfgMHLxnx0HRU319XsFiIgZ3fycxZ7MoTbod+V6rFy2y2Qtld8RvCt0Ug4PVQuLFLU9x\n",
+              "N6gbeWntqj92UVkXYHO8rtnoyHbc5vkyDRwK85+1rEknOmV2fCPAJQWJQHZKzqn/akJ6R91HlWya\n",
+              "u/8GgP8q7KTtX0XyZMALsB3jT/UhmW5AlGIwNHeW1rtDiMG/Xy+69i+m2kTOjww4y5o0/8WfwLLR\n",
+              "RKlhEE1LYjJQjoy3+hNy7YguxzdtR0GOg0UsPQLFZIBnnCwGmFharg9MSkzKoZck80tBnNzVcu5F\n",
+              "Ot8W+bdDLv2E/9UTXci1RXlM26z5jearPa/9d/CciU6kElsImbzJ5J2YpzVs+pvW89XbvAJMExZq\n",
+              "wXD26iUkefzti1p2cc2CbM5qN5CGCTCmR13du1Y9J/JQwXkxhEAAAAFiAZ/ZakN/AHwUpp6Dymc0\n",
+              "2L536BR5shJlFypABdlGcrzfdaw/6f5GB/atQKmEnLjISTsAvG6zfbdBMs7bm2yeFrIQxXuK81kC\n",
+              "9pAAAXcBlvswH72knWeKBsU0Ht1g5h3YcKtQv4e82ah693wXobc+mdHgPA3TBKIFWUv/iM+/E90G\n",
+              "S/NmTeZC+lgt/zT/+HMt/QSFK9C1+AMdH9l6Wmy5eJzA8pumBNuqAArwclv8LW1AC9Ryj7J7dIqZ\n",
+              "2nhKIYQ08cavMFAGExrDHt7RiTs4Auer+jpijDT1MWhCFcQjNZn9nbOp1MdYUZ3batlHR94YKH39\n",
+              "SB9iaEe1H+vDrSDRsP3b0PfVLevCUtQQ7tTMju5YxLigI0SkXHby6oMGwH35DOmYdZ/QEHihEbbH\n",
+              "ljlaWypqm6TR7b/zNBCPoaZiHS0IlbTr/gzMbXxGasP7GssB89XtUV2jZihKJYcij8456L2VAAAC\n",
+              "WkGb3knhClJlMCCH//6qVQAvW48vGhnpxPcAFRvWsRQfCH0ZQNKlkI/Fmy/VFBZqjdqwlFWyRDRU\n",
+              "ATa/x8nSCThm/LYIboN0iejGj3Uchm8nyLv3P3+HOOnCw7+XGsyycSpaT/SKI8hu4RwjrdDxqaYn\n",
+              "k6pZ6qjZtX+IZ04XS8X44piBkZKHHklQnddyez3eJG0JjT0fN5b/c72jAD+sOeXlR6iPKkSUzu0o\n",
+              "3ha2oHN6UEDmISbP1cbB3piI/SHrisHlFNjIuHiEdkqSzG95tlcEE5RmJMFHyIZtmV+VUnHUg//H\n",
+              "WOVjyT0+oFlaS4c8th8dtoQJgchjo9u+OPpSDxEJgWI6zeeh28ogNTGzlwRqjfRSsrTItvjA1MD/\n",
+              "oBFhKLk5Gm5LLSkMpDHu9T5I2IaoH3PKDFRJp5FswrHAqK+C6EMiKJRw3UfQ++e71IzTL0xpDNJL\n",
+              "z6AeitOHT7WHH1q0lcaxtRKIXyzlri2FOeAU+zEh7DbcM3wvbzCPYrbD4ePmP1flYALif0DM+F20\n",
+              "woqO1ciEp6KvfcdLwkVhOi6HukmunTXGsruYaqjkaLT2QlUIMJVPTAaXGvEAsJSG/0vfsDXKkk6Z\n",
+              "sB3ElNrSO3yHej1aIEgW5xnCNisEQsWn6TKnOYGilPN4ZN8EB64V0F8PWNB9Aq0baX+T8kKesmFw\n",
+              "2y/668NRP8ypn4s+0TEew3V5nLH+An+XxWolypflMoVnWhEhG2W+IIgxfWfPuSgDmqBKtSemnfnO\n",
+              "mj2z1HJ4yEmqNoBjJwYnWfK8e0PHHb381Mk1zGGJOgWAAAABUEGf/EU0TDv/AFlVerlP4Rak+BQA\n",
+              "rfH1MAekqKZtO9rI3YpPu0XbIusXd4D2mikBBjNWCs5ZCx1/nIkAW78LpHSyCScRX686DgqeELvg\n",
+              "+6gjEvz9oPv/Q5SyPMBeMNrb/QJ3ato+Qw19nLJWjl0bduh+HilMsrklIYKHCWBaC/dNC4s7Xl/r\n",
+              "RCzM7ZJuRKmUY/D5sEAdr/H6TIVmiD0u2jiehC8y8Gw6flB5fdlWyz5ArpMes88RS9cHH1n4Dp5A\n",
+              "9YiKoxa6XsjMVtwy/Q1CE1CcjEE8nX1x2wi3FF+AiuFwqQsSRlHtfUsVksDBdXLvE8zjbyOIuIMV\n",
+              "pnJU22cEHHqRAVAAAQz/a8I3JUwtCYefKDlHQuITIdlhxtkj1S9/MOKY0At1R1tnioLMWN7HUVCo\n",
+              "b6XS9uoGwS6oOJgKcTFbR1vNa4wchWq0XCPds0DBwQAAAPYBnht0Q38AeTSjvudgsbkOLNHOwJSE\n",
+              "7MIAOT4Tae/DlzyAOhFcKHSt+XmND2K3krM1WAe1ksxoXOx8R5ib25iI4yoXHAvjcPvcDoLvQIYy\n",
+              "rfzkEj8FCsgVqTty2M7mcrrsvBMmGI/tSEAq1Wpq/wSUg2I4oZj0GjiChzewD+uw3YnWAi/Ntf5Y\n",
+              "Cv2dU9qEo9e3jPCavhxnj6HVQyqcvxekJ6cEcAGQvRh8PwiQyys4LYMz+Th6jmnZO6zDQlY1h459\n",
+              "aXiX/1NPDVjhvbOibPxdXy1nW8ZFN/ZpmMtUtTAz4mvuGfLCJYTZv8r0n1cztBPRieehovEAAAGy\n",
+              "AZ4dakN/AHwTrqiSAEDVZr7cfUIfCi6SEtf6z4BBmn/qEvCbGFYoG0hJzipIIEfgPxGLOPb5hgYo\n",
+              "3EqlxYfhyi3ADlPB0rSvUe/2K1c1bOHHkBdbN7v2fRCe6cTgBUViIyBzKbW8+YVzs1NjLsftvDLF\n",
+              "Jws+AVbFUOsz2XZO6+tJqS4okplORVfI8Zh8pjE7ly6+HI7Omo301kEp6VZks8VHiVKJOuTRsuFe\n",
+              "1lak9cDIgZS7IV3MkEjdmu8V6wPVTOui5KhgRegdKpe7dvKwiZROacSHUyEpgoiQ49NAkgd9ICSC\n",
+              "nOG96XtcVUK5qLGXI1ECEXtJcuaFVMtCmmOBBiFL8jC1MpHbxQ+4k2qRSUjP3JvFi0NfrsxeXbrH\n",
+              "Ebg5vBmNpJE6T+wdC73c70xC+Mtp+wYFzu5kfTKcL8d+Nzu4GlIr338e6SWwNSpXRGjfdLp9o3Ic\n",
+              "2PzMtQmrlpbEeUDp1vnkaZoqSF5M9xanIk/zohgoPX5++NN/ebYvr56WROjUeIUdsOf6nrJlmboT\n",
+              "DZEat6r4aY15lVCgiz4Mpb/mqSazxzrszmdRYRxGsW8DnzAAAAHfQZoCSahBaJlMCHf//qmWALFy\n",
+              "5oM61QiAB+cxK4+jNCOHXw6RALujtnWF0llKsvjvaSIz+44BdTBn8Dqmduydu0Ab2yYLL8rBa9BR\n",
+              "bM/WBrO6FCt4pfpaT57HiAbORTevnWHgnUCdwsiqbddvhjkiuJYbgCMD0kEP1SURu/b2Z5hWsq5s\n",
+              "eIdJwlVUmffx/GFsHH2OVg2kldaudIzyWEsMXsnZccvZ4+1TTMECSDKdUtlhUW9AAgPUraaePKP1\n",
+              "hatMAsKbsEP5g1nzjTlmyHjs7FjRbwjKng4/qsqVQ+s9Z8Le9mq44VPerxrlkKxdRgf8PQXTEpxP\n",
+              "gMR8UP9I/vRSJBbzTafYsMhPytfC8ESUe9ySga0pNZKSvC+bN1h7zO9OEjqF3rsnXJU2SZN7NAbS\n",
+              "01WCPkWQIdWN39TZ8BwhuM2E1/XfXA9OxCI/7PAG40Z8M1rKVJPTY+iwZnIQA6cEF3rnJVasn/JZ\n",
+              "rircnzzi1JQr5NiwthCEkD02k7GAoyHtF8lIKArvw+GqH7Ox1Tpd6DhPPJm2hmyijeFH6E+9UCJk\n",
+              "Iiolc9K3UW1rmUlHlF/p9jHAvsiiJUpuG/KCfna2LEYj9yn6P2oNlWfqq5P2HNtctaJeVRZv9Qb/\n",
+              "mNVjyjAAAAErQZ4gRREsO/8AZUEtk8LzOoS4AAhIFC88oI10PfUAs3UxxCOOtSzHREgn4/jgVfHt\n",
+              "0r483Tf2Y8D+zGlycQw2lUV6Nidlo0k0sASUCm4dEwF8Hb0+IzseFE0dYexJdLqvhcI7IIUIH6RG\n",
+              "uv8cjTXFD8CTksvYGpGc+uBYXhlwc3/jHhNGtm8G24uHniey+Zy/NtEpSl5dub3bE324kx+/N1gF\n",
+              "sU/CxkQF6UQWvd6Br4nL+i2L6udCLqM/JAVJhScc01UR/bE+NX2i3upx0qofgxfWL8unNZ/BP9Vc\n",
+              "CvVXAtxPw+0JopAnWMlwtBFG9wd+oP4zOIJ88u/VEvyZQd0JJP1Y3qhYk13Deyiv0C1r6ci1z7CQ\n",
+              "UwYqgUT64pT/hlIvHeCzEZxqH+WbUbEAAAGYAZ5fdEN/AIteE+hbrZmAAHNd3/IVGxTYP4E6C+Wr\n",
+              "63le3xAHjzqOqEil1tIAAUY3LvF62/277H30QskV8sEjceHvPe7bE0mfZ44avBY2gS0AAAMByRDk\n",
+              "EKOyh31Y2H0mdsy+zcGsPrGm3pHtO2riBcgILxHO0F5398HG90hK8UgtDUfp9CQyPOvDSyEU4WTb\n",
+              "6/WT9Z3aca6tb4C53W6p8Geyjq/mwbvNpnCVbbqIcx1ZT2+dencovmeYmPlI7jrhk6KwLYEd+5gO\n",
+              "J2YeKk4iWai6BsaO9+Tb5P52jBVHcSZ+Vws5QhTxkBSpdHlWJRcbh50V4ViVltwUN//XNx+jx2bk\n",
+              "KsfglI41FGmS2xAJtr8ZhKDk1VRRL2tGsNB5nztuRXCFd8q4MIuVVWGjim0ntcxZ/R18mzJZN+sI\n",
+              "qKUvfsxoaeZp+oIaU1hLeXzgcHEe+3/6emdZeJWoDNhUqhkfWzWzVZbEzUKpDBS9AbVIA5KR27LD\n",
+              "3HEfRMw9yt8eYILg7m/Rm2ubtU8u6V2QuxVXq1OHry5oY2TAAAABvQGeQWpDfwCPV5unds/RGF4o\n",
+              "aWlq+XwTSVpG+igacFOApaqyNJIXSXT4q7gA4DkP0YAYAumNCN0MwD7HSEeIsv3Q3L9kZ2RagxvU\n",
+              "jle4yQq6Zl5W7AgdlZnaBngH/w8xYsqWx5t90zzi7s9VyRY9jaNshfxuJAZcRgFILNTmQNCPoCtl\n",
+              "wyo5Ht91VCy2qSby6JDLeTD096PzM4KOK7/I+amuefuT0S/QnDNs952oi11JV2mbadqtKDqJE9x4\n",
+              "nX/OjU9PBP1uhsFLNkjsz6ZHlTOcsZvWUxabbw0HBNFuLXWIYqtAYdWN7c/QUoqY2IlVBR//v+NN\n",
+              "Bxf/rxPv+9QlTTeUOAVhzyU/kQACorW+VEL2KFNUPF85LUxlbSGEYQv/98/fAQAu6hKRw3yoJoPy\n",
+              "tyr7S7Za9gGurMYseuvuasNoB+fPCmp37VWgm4yNZQ0LM+8CPtaQgShVMs2/RIG2cXksHuYVqEB7\n",
+              "PJtzP2tl8EYDen8RohIb2UO5d/Xdc8aoi/Nu4IzGq8ApuZIxjC5J9bUYtMDEDA6eChGKPjb20vqg\n",
+              "2PRBI2fSXJrcSROGTC4m+VsF+VagO1LnjrakndEAAAHtQZpDSahBbJlMCG///qeEAVH55ayIAL6z\n",
+              "9D9Go2JR/VsPgULYIy+HM1JNQWUio64eqKV59gHDbxQ77xKGvVi/RlMeepNHF+Cplpp4rKqgivaK\n",
+              "14o0jVVjKwdzXmYfm8QJck76NrSj9rXzMi3Th9DbQ5HQHvlFr1+Ft6fGVXaubVoF+Bx3J4nvsWO+\n",
+              "FhXDphKaWh9geM/3PqX1TK4zqhRL2wKgDCWdLvIi2s2e48RSWR1zksj0SjkMINJfgjA7wVj0dW8Z\n",
+              "NZGlcRPjgkoSgpomI+x9/l7dJ5fHEj4WOkMQMTJnj+KOqaXfgtXbhBachZ0Av1Z6rh+qw/iObJOy\n",
+              "7q2gUdlftEWI7In7KZjqqg18Bg+z35wI2FmknOyXdEiDAPaFiRrhqkKOLfgLssw1BdohiuTGWlKn\n",
+              "NvPL4EzIbAUeS+0qv5cFdXvRjnn1zOMYTMpyN1CZYg4pqjj8mGtGdm1F7w0Xo4Mnm3hRmvZyyOaW\n",
+              "yf38s1SCwyOkhQcwJhrAAebvkxMWrAUWrTq9K9PdCUqFbMVB9+93aovoux8zBfM/WLangtLLXd/D\n",
+              "T9TcgY0eosWGZeAhQk2sxNC3bgvMT328AT2T2XCg2nG4jsOakPWfscwbc0zKfItj/1eXvyR2tk+K\n",
+              "fpgdg9dJ/OdcXINTUAAAB95tb292AAAAbG12aGQAAAAAAAAAAAAAAAAAAAPoAAAnEAABAAABAAAA\n",
+              "AAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAA\n",
+              "AAAAAAAAAAAAAAAAAAACAAAHCHRyYWsAAABcdGtoZAAAAAMAAAAAAAAAAAAAAAEAAAAAAAAnEAAA\n",
+              "AAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAEAAAAABsAAAASAA\n",
+              "AAAAACRlZHRzAAAAHGVsc3QAAAAAAAAAAQAAJxAAAAgAAAEAAAAABoBtZGlhAAAAIG1kaGQAAAAA\n",
+              "AAAAAAAAAAAAACgAAAGQAFXEAAAAAAAtaGRscgAAAAAAAAAAdmlkZQAAAAAAAAAAAAAAAFZpZGVv\n",
+              "SGFuZGxlcgAAAAYrbWluZgAAABR2bWhkAAAAAQAAAAAAAAAAAAAAJGRpbmYAAAAcZHJlZgAAAAAA\n",
+              "AAABAAAADHVybCAAAAABAAAF63N0YmwAAACzc3RzZAAAAAAAAAABAAAAo2F2YzEAAAAAAAAAAQAA\n",
+              "AAAAAAAAAAAAAAAAAAABsAEgAEgAAABIAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n",
+              "AAAAAAAAAAAY//8AAAAxYXZjQwFkABX/4QAYZ2QAFazZQbCWhAAAAwAEAAADAFA8WLZYAQAGaOvj\n",
+              "yyLAAAAAHHV1aWRraEDyXyRPxbo5pRvPAyPzAAAAAAAAABhzdHRzAAAAAAAAAAEAAABkAAAEAAAA\n",
+              "ABRzdHNzAAAAAAAAAAEAAAABAAADMGN0dHMAAAAAAAAAZAAAAAEAAAgAAAAAAQAAFAAAAAABAAAI\n",
+              "AAAAAAEAAAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABQA\n",
+              "AAAAAQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAA\n",
+              "AAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABQAAAAAAQAACAAAAAABAAAAAAAA\n",
+              "AAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAMAAAAAAEAAAQAAAAA\n",
+              "AQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAAB\n",
+              "AAAEAAAAAAEAABQAAAAAAQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEA\n",
+              "AAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABQAAAAAAQAA\n",
+              "CAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAM\n",
+              "AAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgA\n",
+              "AAAAAQAAAAAAAAABAAAEAAAAAAEAABQAAAAAAQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAA\n",
+              "AAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAA\n",
+              "AAEAABQAAAAAAQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAA\n",
+              "AQAABAAAAAABAAAMAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAAB\n",
+              "AAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAAAgAAAAAHHN0c2MAAAAAAAAAAQAAAAEA\n",
+              "AABkAAAAAQAAAaRzdHN6AAAAAAAAAAAAAABkAAAGhgAAAl8AAAFjAAAAvgAAAXYAAAHzAAABDgAA\n",
+              "ATYAAAFIAAAB9QAAAOIAAAD6AAABWgAAAbAAAADTAAAB8wAAAN4AAAH+AAABEAAAAOIAAAG2AAAC\n",
+              "DAAAAWUAAAGkAAABmgAAAckAAAEdAAABfQAAAPMAAAFxAAABIgAAAjYAAAEmAAAA5AAAAXoAAAH+\n",
+              "AAAA/wAAAT0AAAFnAAACAwAAARQAAAE3AAABTwAAAckAAADrAAACFwAAAP0AAAHzAAABIQAAAOAA\n",
+              "AAHKAAACOwAAAVQAAAHFAAABugAAAdQAAAD3AAABUgAAARIAAAFuAAABLwAAAhAAAAERAAAA9gAA\n",
+              "AZkAAAIqAAABIgAAAV0AAAGIAAACSgAAASgAAAFEAAABggAAAegAAAD+AAACCgAAASIAAAIdAAAB\n",
+              "KAAAAQcAAAHbAAACFgAAAT0AAAITAAAB2gAAAi8AAAEGAAABrQAAASoAAAF0AAABZgAAAl4AAAFU\n",
+              "AAAA+gAAAbYAAAHjAAABLwAAAZwAAAHBAAAB8QAAABRzdGNvAAAAAAAAAAEAAAAsAAAAYnVkdGEA\n",
+              "AABabWV0YQAAAAAAAAAhaGRscgAAAAAAAAAAbWRpcmFwcGwAAAAAAAAAAAAAAAAtaWxzdAAAACWp\n",
+              "dG9vAAAAHWRhdGEAAAABAAAAAExhdmY1Ny44My4xMDA=\n",
+              "\"\u003e\n",
+              "  Your browser does not support the video tag.\n",
+              "\u003c/video\u003e"
+            ],
+            "text/plain": [
+              "\u003cIPython.core.display.HTML at 0x7f84b2253b50\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "output_type": "display_data"
+        }
+      ],
+      "source": [
+        "import time\n",
+        "import traceback\n",
+        "\n",
+        "from matplotlib import pyplot as plt\n",
+        "from matplotlib import animation as anim\n",
+        "import tensorflow as tf\n",
+        "from tensorflow.contrib import autograph as ag\n",
+        "from IPython import display\n",
+        "\n",
+        "\n",
+        "@ag.do_not_convert(ag.RunMode.PY_FUNC)\n",
+        "def render(boards):\n",
+        "  fig = plt.figure()\n",
+        "\n",
+        "  ims = []\n",
+        "  for b in boards:\n",
+        "    im = plt.imshow(b, interpolation='none')\n",
+        "    im.axes.get_xaxis().set_visible(False)\n",
+        "    im.axes.get_yaxis().set_visible(False)\n",
+        "    ims.append([im])\n",
+        "\n",
+        "  try:\n",
+        "    ani = anim.ArtistAnimation(\n",
+        "        fig, ims, interval=100, blit=True, repeat_delay=5000)\n",
+        "    plt.close()\n",
+        "\n",
+        "    display.display(display.HTML(ani.to_html5_video()))\n",
+        "  except RuntimeError:\n",
+        "    print('Coult not render animation:')\n",
+        "    traceback.print_exc()\n",
+        "\n",
+        "\n",
+        "def gol_episode(board):\n",
+        "  directions = tf.constant(\n",
+        "      ((-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)))\n",
+        "\n",
+        "  new_board = []\n",
+        "  ag.set_element_type(new_board, tf.int32)\n",
+        "\n",
+        "  for i in range(len(board)):\n",
+        "    for j in range(len(board[i])):\n",
+        "      num_neighbors = 0\n",
+        "      for d in directions:\n",
+        "        ni = i + d[0]\n",
+        "        nj = j + d[1]\n",
+        "        if ni \u003e= 0 and nj \u003e= 0 and ni \u003c len(board) and nj \u003c len(board[i]):\n",
+        "          num_neighbors += board[ni][nj]\n",
+        "      \n",
+        "      new_cell = 0\n",
+        "      if num_neighbors == 2:\n",
+        "        new_cell = board[i][j]\n",
+        "      elif num_neighbors == 3:\n",
+        "        new_cell = 1\n",
+        "      \n",
+        "      new_board.append(new_cell)\n",
+        "  final_board = ag.stack(new_board)\n",
+        "  final_board = tf.reshape(final_board, board.shape)\n",
+        "  return final_board\n",
+        "  \n",
+        "\n",
+        "def gol(initial_board):\n",
+        "  board = initial_board\n",
+        "  boards = []\n",
+        "  ag.set_element_type(boards, tf.int32)\n",
+        "  # We are being explicit about tensor constants to ensure the loop\n",
+        "  # is not unrolled in the graph. This may change in the future.\n",
+        "  for i in range(tf.constant(NUM_STEPS)):\n",
+        "    board = gol_episode(board)\n",
+        "    boards.append(board)\n",
+        "  boards = ag.stack(boards)\n",
+        "  render(boards)\n",
+        "  return tf.no_op()\n",
+        " \n",
+        "\n",
+        "with tf.Graph().as_default():\n",
+        "  # Gosper glider gun\n",
+        "  # Adapted from http://www.cplusplus.com/forum/lounge/75168/\n",
+        "  _ = 0\n",
+        "  initial_board = tf.constant((\n",
+        "      ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+        "      ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+        "      ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,1,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+        "      ( _,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_,_,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_ ),\n",
+        "      ( _,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,1,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_ ),\n",
+        "      ( _,1,1,_,_,_,_,_,_,_,_,1,_,_,_,_,_,1,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+        "      ( _,1,1,_,_,_,_,_,_,_,_,1,_,_,_,1,_,1,1,_,_,_,_,1,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+        "      ( _,_,_,_,_,_,_,_,_,_,_,1,_,_,_,_,_,1,_,_,_,_,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+        "      ( _,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+        "      ( _,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+        "      ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+        "      ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n",
+        "  ))\n",
+        "  initial_board = tf.pad(initial_board, ((0, 20), (0, 10)))\n",
+        "  \n",
+        "  tf_gol = ag.to_graph(gol)\n",
+        "  game_ops = tf_gol(initial_board)\n",
+        "  with tf.Session() as sess:\n",
+        "    sess.run(game_ops)\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "7NgrSPCZxs3h"
+      },
+      "source": [
+        "#### Generated code"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 9,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 2323
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 753,
+          "status": "ok",
+          "timestamp": 1532101593840,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 240
+        },
+        "id": "hIGYeX0Cxs3i",
+        "outputId": "e0b62eb1-3e12-4e53-dc54-8a3fa56d823d"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "from __future__ import print_function\n",
+            "import tensorflow as tf\n",
+            "\n",
+            "def tf__gol_episode(board):\n",
+            "  try:\n",
+            "    with tf.name_scope('gol_episode'):\n",
+            "      directions = tf.constant(((-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1),\n",
+            "          (1, -1), (1, 0), (1, 1)))\n",
+            "      new_board = ag__.new_list([])\n",
+            "\n",
+            "      def extra_test_2(new_board_2):\n",
+            "        with tf.name_scope('extra_test_2'):\n",
+            "          return True\n",
+            "\n",
+            "      def loop_body_2(i, new_board_2):\n",
+            "        with tf.name_scope('loop_body_2'):\n",
+            "\n",
+            "          def extra_test_1(new_board_1):\n",
+            "            with tf.name_scope('extra_test_1'):\n",
+            "              return True\n",
+            "\n",
+            "          def loop_body_1(j, new_board_1):\n",
+            "            with tf.name_scope('loop_body_1'):\n",
+            "              num_neighbors = 0\n",
+            "\n",
+            "              def extra_test(num_neighbors_2):\n",
+            "                with tf.name_scope('extra_test'):\n",
+            "                  return True\n",
+            "\n",
+            "              def loop_body(d, num_neighbors_2):\n",
+            "                with tf.name_scope('loop_body'):\n",
+            "                  ni = i + ag__.get_item(d, (0), opts=ag__.GetItemOpts(\n",
+            "                      element_dtype=None))\n",
+            "                  nj = j + ag__.get_item(d, (1), opts=ag__.GetItemOpts(\n",
+            "                      element_dtype=None))\n",
+            "\n",
+            "                  def if_true():\n",
+            "                    with tf.name_scope('if_true'):\n",
+            "                      num_neighbors_1, = num_neighbors_2,\n",
+            "                      num_neighbors_1 += ag__.get_item(ag__.get_item(board,\n",
+            "                          (ni), opts=ag__.GetItemOpts(element_dtype=None)),\n",
+            "                          (nj), opts=ag__.GetItemOpts(element_dtype=None))\n",
+            "                      return num_neighbors_1,\n",
+            "\n",
+            "                  def if_false():\n",
+            "                    with tf.name_scope('if_false'):\n",
+            "                      return num_neighbors_2,\n",
+            "                  num_neighbors_2 = ag__.utils.run_cond(tf.logical_and(tf.\n",
+            "                      greater_equal(ni, 0), tf.logical_and(tf.greater_equal\n",
+            "                      (nj, 0), tf.logical_and(tf.less(ni, ag__.utils.\n",
+            "                      dynamic_builtin(len, board)), tf.less(nj, ag__.utils.\n",
+            "                      dynamic_builtin(len, ag__.get_item(board, (i), opts=\n",
+            "                      ag__.GetItemOpts(element_dtype=None))))))), if_true,\n",
+            "                      if_false)\n",
+            "                  return num_neighbors_2,\n",
+            "              num_neighbors = ag__.for_stmt(directions, extra_test,\n",
+            "                  loop_body, (num_neighbors,))\n",
+            "              new_cell = 0\n",
+            "\n",
+            "              def if_true_2():\n",
+            "                with tf.name_scope('if_true_2'):\n",
+            "                  new_cell_2, = new_cell,\n",
+            "                  new_cell_2 = ag__.get_item(ag__.get_item(board, (i), opts\n",
+            "                      =ag__.GetItemOpts(element_dtype=None)), (j), opts=\n",
+            "                      ag__.GetItemOpts(element_dtype=None))\n",
+            "                  return new_cell_2,\n",
+            "\n",
+            "              def if_false_2():\n",
+            "                with tf.name_scope('if_false_2'):\n",
+            "                  new_cell_3, = new_cell,\n",
+            "\n",
+            "                  def if_true_1():\n",
+            "                    with tf.name_scope('if_true_1'):\n",
+            "                      new_cell_1, = new_cell_3,\n",
+            "                      new_cell_1 = 1\n",
+            "                      return new_cell_1,\n",
+            "\n",
+            "                  def if_false_1():\n",
+            "                    with tf.name_scope('if_false_1'):\n",
+            "                      return new_cell_3,\n",
+            "                  new_cell_3 = ag__.utils.run_cond(tf.equal(num_neighbors, \n",
+            "                      3), if_true_1, if_false_1)\n",
+            "                  return new_cell_3,\n",
+            "              new_cell = ag__.utils.run_cond(tf.equal(num_neighbors, 2),\n",
+            "                  if_true_2, if_false_2)\n",
+            "              new_board_1 = ag__.list_append(new_board_1, new_cell)\n",
+            "              return new_board_1,\n",
+            "          new_board_2 = ag__.for_stmt(ag__.utils.dynamic_builtin(range,\n",
+            "              ag__.utils.dynamic_builtin(len, ag__.get_item(board, (i),\n",
+            "              opts=ag__.GetItemOpts(element_dtype=None)))), extra_test_1,\n",
+            "              loop_body_1, (new_board_2,))\n",
+            "          return new_board_2,\n",
+            "      new_board = ag__.for_stmt(ag__.utils.dynamic_builtin(range, ag__.\n",
+            "          utils.dynamic_builtin(len, board)), extra_test_2, loop_body_2, (\n",
+            "          new_board,))\n",
+            "      final_board = ag__.list_stack(new_board, opts=ag__.ListStackOpts(\n",
+            "          element_dtype=tf.int32, original_call=ag.stack))\n",
+            "      final_board = tf.reshape(final_board, board.shape)\n",
+            "      return final_board\n",
+            "  except:\n",
+            "    ag__.rewrite_graph_construction_error(ag_source_map__)\n",
+            "\n",
+            "def tf__gol(initial_board):\n",
+            "  try:\n",
+            "    with tf.name_scope('gol'):\n",
+            "      board = initial_board\n",
+            "      boards = ag__.new_list([])\n",
+            "\n",
+            "      def extra_test(board_1, boards_1):\n",
+            "        with tf.name_scope('extra_test'):\n",
+            "          return True\n",
+            "\n",
+            "      def loop_body(i, board_1, boards_1):\n",
+            "        with tf.name_scope('loop_body'):\n",
+            "          board_1 = tf__gol_episode(board_1)\n",
+            "          boards_1 = ag__.list_append(boards_1, board_1)\n",
+            "          return board_1, boards_1\n",
+            "      board, boards = ag__.for_stmt(ag__.utils.dynamic_builtin(range, tf.\n",
+            "          constant(NUM_STEPS)), extra_test, loop_body, (board, boards))\n",
+            "      boards = ag__.list_stack(boards, opts=ag__.ListStackOpts(\n",
+            "          element_dtype=tf.int32, original_call=ag.stack))\n",
+            "      with ag__.utils.control_dependency_on_returns(render(boards)):\n",
+            "        boards_2 = ag__.utils.alias_tensors(boards)\n",
+            "        return tf.no_op()\n",
+            "  except:\n",
+            "    ag__.rewrite_graph_construction_error(ag_source_map__)\n",
+            "\n"
+          ]
+        }
+      ],
+      "source": [
+        "print(ag.to_code(gol))"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [
+        "p8zZyj-tq4K3",
+        "Lkq3DBGOv3fA",
+        "r8_0ioEuAI-a",
+        "7NgrSPCZxs3h"
+      ],
+      "default_view": {},
+      "last_runtime": {
+        "build_target": "",
+        "kind": "local"
+      },
+      "name": "Simple algorithms using AutoGraph",
+      "provenance": [
+        {
+          "file_id": "19q8KdVF8Cb_fDd13i-WDOG_6n_QGNW5-",
+          "timestamp": 1528465909719
+        }
+      ],
+      "version": "0.3.2",
+      "views": {}
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb b/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb
index 86e38c3490a52e18131dc1816c636fbe132cea7c..7e9cc54d4cafa64e4cd3b48f9376b1b2b4d3575e 100644
--- a/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb
+++ b/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb
@@ -130,7 +130,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 4,
+      "execution_count": 0,
       "metadata": {
         "colab": {
           "autoexec": {
@@ -201,7 +201,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 5,
+      "execution_count": 0,
       "metadata": {
         "colab": {
           "autoexec": {
@@ -275,7 +275,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 6,
+      "execution_count": 0,
       "metadata": {
         "colab": {
           "autoexec": {
@@ -362,7 +362,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 7,
+      "execution_count": 0,
       "metadata": {
         "colab": {
           "autoexec": {
@@ -392,7 +392,7 @@
           "output_type": "stream",
           "text": [
             "Got error message: assertion failed: [Do not pass zero!]\n",
-            "\t [[Node: f/Assert/Assert = Assert[T=[DT_STRING], summarize=3, _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](f/NotEqual, f/Assert/Assert/data_0)]]\n"
+            "\t [[{{node f/Assert/Assert}} = Assert[T=[DT_STRING], summarize=3, _device=\"/job:localhost/replica:0/task:0/device:CPU:0\"](f/NotEqual, f/Assert/Assert/data_0)]]\n"
           ]
         }
       ],
@@ -467,7 +467,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 9,
+      "execution_count": 0,
       "metadata": {
         "colab": {
           "autoexec": {
@@ -531,7 +531,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 10,
+      "execution_count": 0,
       "metadata": {
         "cellView": "code",
         "colab": {
@@ -896,7 +896,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 17,
+      "execution_count": 0,
       "metadata": {
         "colab": {
           "autoexec": {
@@ -1070,8 +1070,8 @@
         "  return dataset\n",
         "\n",
         "\n",
-        "train_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/train.csv\"\n",
-        "test_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/extras/colorbot/data/test.csv\"\n",
+        "train_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/archive/extras/colorbot/data/train.csv\"\n",
+        "test_url = \"https://raw.githubusercontent.com/random-forests/tensorflow-workshop/master/archive/extras/colorbot/data/test.csv\"\n",
         "data_dir = \"tmp/rnn/data\""
       ]
     },
@@ -1304,7 +1304,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 22,
+      "execution_count": 0,
       "metadata": {
         "colab": {
           "autoexec": {
@@ -1905,10 +1905,6 @@
     "colab": {
       "collapsed_sections": [],
       "default_view": {},
-      "last_runtime": {
-        "build_target": "",
-        "kind": "local"
-      },
       "name": "Dev Summit 2018 - Autograph",
       "provenance": [
         {
diff --git a/tensorflow/contrib/autograph/examples/notebooks/graph_vs_ag_vs_eager_sum_speed_test.ipynb b/tensorflow/contrib/autograph/examples/notebooks/graph_vs_ag_vs_eager_sum_speed_test.ipynb
new file mode 100644
index 0000000000000000000000000000000000000000..32742bec7ee4a412aabb6640b5a1329353ebfc9d
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/notebooks/graph_vs_ag_vs_eager_sum_speed_test.ipynb
@@ -0,0 +1,519 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "moMkWaT_TTHi"
+      },
+      "source": [
+        "This Colab illustrates the differing overhead* between a custom, vectorized graph operation and a loop over a tensor\n",
+        "that computes the same function. The loop is implemented in TensorFlow Eager mode using Python syntax and control-flow, and using AutoGraph which takes a python function and converts it into graph mode. In AutoGraph the Python loop is converted into a tf.while_loop.\n",
+        "\n",
+        "The actual computation, summing a small number of scalar values, takes very little time to compute, so the graphs below are showing the overhead of the differing approaches. As such, this is more of a \"micro-benchmark\" than a representation of real-world performance of the three approaches.\n",
+        "\n",
+        "*Note the differing scales of the included plots"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "a0X_rfvuav98"
+      },
+      "source": [
+        "### Imports"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "EdxWv4Vn0ync"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install -U -q tf-nightly"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "erq3_S7QsjkU"
+      },
+      "outputs": [],
+      "source": [
+        "from __future__ import absolute_import\n",
+        "from __future__ import division\n",
+        "from __future__ import print_function\n",
+        "\n",
+        "import numpy as np\n",
+        "import tensorflow as tf\n",
+        "import matplotlib.pyplot as plt\n",
+        "import math\n",
+        "import time\n",
+        "import random\n",
+        "from colabtools import adhoc_import\n",
+        "from tensorflow.contrib import autograph as ag\n",
+        "from tensorflow.python.framework import function"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "1JgnsXooa2RP"
+      },
+      "source": [
+        "### Testing boilerplate"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "UyD5LLjVZzny"
+      },
+      "outputs": [],
+      "source": [
+        "# Test-only parameters. Test checks successful completion not correctness. \n",
+        "burn_ins = 1\n",
+        "trials = 1\n",
+        "batches = 2\n",
+        "max_elements = 2"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "4_NBL0RQa8gY"
+      },
+      "source": [
+        "### Speed comparison parameters"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "Yq6daecyiJV5"
+      },
+      "outputs": [],
+      "source": [
+        "#@test {\"skip\": true} \n",
+        "burn_ins = 3 # Batches not counted in the average\n",
+        "trials = 10 # Batches run per vector-size (and averaged)\n",
+        "batches = 1000 # Number of random vectors summed over per trial\n",
+        "max_elements = 100 # Vectors of size 0 to this-1 will be executed and plotted"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "fiR8m13CbKH2"
+      },
+      "source": [
+        "### Random input"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "d8vrTlyNXuxc"
+      },
+      "outputs": [],
+      "source": [
+        "# Construct a random num x 1 tensor\n",
+        "def get_elements(num):\n",
+        "  return tf.random_uniform(shape=(num, 1), maxval=1)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "ILJ6SbF3bXFQ"
+      },
+      "source": [
+        "## Graph mode"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "vovRf597X55n"
+      },
+      "outputs": [],
+      "source": [
+        "def tf_sum(elements):\n",
+        "  # Using custom vectorized op\n",
+        "  return tf.reduce_sum(elements)\n",
+        "\n",
+        "def run_trial(num):\n",
+        "  elements = get_elements(num)\n",
+        "  return tf_sum(elements)\n",
+        "\n",
+        "\n",
+        "\n",
+        "graph_means = []\n",
+        "for num in range(max_elements):\n",
+        "  with tf.Graph().as_default():\n",
+        "    durations = []\n",
+        "    foo = run_trial(num)\n",
+        "  \n",
+        "    with tf.Session() as sess:\n",
+        "      \n",
+        "      for _ in range(burn_ins):\n",
+        "        for _ in range(batches):\n",
+        "          sess.run(foo)\n",
+        "      \n",
+        "      for _ in range(trials):\n",
+        "      \n",
+        "        start = time.time()\n",
+        "        for _ in range(batches):\n",
+        "          sess.run(foo)\n",
+        "      \n",
+        "        duration = time.time() - start\n",
+        "        durations.append(duration)    \n",
+        "      \n",
+        "    graph_means.append(np.mean(durations))  "
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 301
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 278,
+          "status": "ok",
+          "timestamp": 1532447361278,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 240
+        },
+        "id": "Jm9Blkyx90Eq",
+        "outputId": "d83cd51f-7e56-4d73-f7df-bb157dee46df"
+      },
+      "outputs": [
+        {
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAa8AAAEcCAYAAABwNTvaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3WdgVFXegPFnZtI7kJ5QQwlCKIGERGroEoSAYZFVEFAR\ngV1XXHvbFWEtK6xlUVgRXMuLiqBSZQUh9E5ChxRIn/ReJjNz3g8hNxkzCUMJkHh+X2DmtnPPnXv+\n95R7ohJCCCRJkiSpGVHf6QRIkiRJ0vWSwUuSJElqdmTwkiRJkpodGbwkSZKkZkcGL0mSJKnZkcFL\nkiRJanaaJHitWLGCV199tSl23WINHz6cAwcONPlxXnzxRd5///0mP87dIikpiUmTJtGvXz++/PLL\nO52cFi0+Pp4HHnjgTifjmsaPH8+RI0du6T5/b/eVObcqX9966y3Wrl17zfWsbmTnffv2RaVSAVBe\nXo6NjQ1qtRqVSsUbb7zBE088cSO7vW5paWmMGDGCs2fPolY3n0rkiy++iLe3N0899dSdTspdoSmv\n46effsqAAQPYsGGD2eVbt27l888/5/z58/Tq1Yv//ve/JsvPnTvHyy+/TGJiIgEBASxevJjAwEBl\n+bvvvsu6detQqVQ88MADPPvssxZv29SmT5/OxIkTiY6Ovi3H++CDD3jsscea9BjDhw9n8eLFhIeH\n3/A+Nm3adAtT1LgVK1bwySefoFKp0Ov16PV67OzsEELg7+/Pxo0bCQwMxN7eHpVKhRACa2trDh8+\nfNvSeCPMlWG3Kl8fffRRpkyZQnR0NFZWDYeoGyopTpw4wfHjxzl+/Di+vr6sWLFC+W78+PE3nOjr\nJYRQLrjUfDXldUxPT6dz584NLndzc2PmzJnMmTOn3rKqqirmz59PVFQUR44cISoqinnz5qHX6wFY\nu3YtO3fuZOPGjfz000/s2rWLb775xqJtm4PruR7Z2dkcOnSIESNGNGGKbo7BYLjtx3ziiSeUsvHv\nf/87ffv25fjx45w4cYKNGzcCoFKp+Omnn5Tv73TguhP5VJeHhwcBAQHs3Lmz0fVu+jFXCFHvR/7R\nRx8pT6BpaWkEBgayfv16hg0bxoABA1i7di2nTp1iwoQJhIaGsmjRIpPt161bx7hx4xgwYACPPfYY\n6enpZo89ffp0APr3709wcDCxsbEIIVi+fDnDhw9n4MCBvPDCC5SUlJjdPj8/n7lz5xISEsKAAQN4\n+OGHlWWBgYGkpKQon+s2Cxw+fJihQ4fy6aefcu+99zJ48GB++eUXdu/ezZgxYxgwYAArVqwwe8xv\nv/2WjRs38umnnxIcHMyTTz6pLDt37hwTJkwgJCSEhQsXotPplGW//vorUVFRhISEMG3aNC5cuGB2\n/wAJCQnMnj2bAQMGcN9997F169YG121sv8OHD2fVqlVMmDCBvn378sorr5Cbm8vjjz9OcHAws2fP\npri4WFn/5MmTPPjgg4SEhBAVFWVyE06fPp3333+fadOmERwczKOPPkpBQYGyDEyvY3JyMtOnT6d/\n//6Eh4ezcOHCBs9hx44djB8/ntDQUGbMmEFiYiIAjzzyCIcOHeKNN94gODiYK1eu1Ns2PDycsWPH\n4uHhUW/Z4cOHMRgMzJgxA2tra6ZPn44QgoMHDwLwww8/MHv2bDw9PfH09GTWrFlKDe/QoUONblvX\nli1b6jW3rVmzhnnz5gGg0+l4++23iYiIYNCgQfztb38z+W388ssvREVF0a9fP0aPHs3evXtZtmwZ\nx44dY9GiRQQHB/Pmm28CcPz4caKjowkJCWHKlCmcOHHC5BotW7aMadOm0adPH1JTU1m/fj0jR44k\nODiYkSNHNvh0vW/fPnr06IGNjQ0AK1eu5M9//rPJOm+++SaLFy8GoKSkhJdffplBgwYxdOhQ/vWv\nf5mUI99++y3jxo0jODiY8ePHc+7cOZ577jkyMjJ48sknCQ4OZtWqVUD965+QkKDsZ/jw4fznP/9R\nfsMGg8GkiT4kJITg4GCCg4Pp27cvgYGBSnnT2L1x9uxZJk+eTL9+/Xj66aeprKw0my+WsPQhobGy\nraac/fbbbxk8eDCDBw9m9erVJtuuXLmSUaNGERYWxtNPP01RUZHJtuvWrSMiIoKZM2cC8NRTTzFo\n0CBCQkKYPn26kq8NlWF181Wn07F48WIGDx7MkCFDWLJkCVVVVUBt+bl69Wql/Fy/fr3JuYaEhLBr\n165rZshNiYiIEPv37zf57sMPPxTPPvusEEKI1NRU0a1bN/H666+LyspKsW/fPhEUFCTmz58v8vLy\nRGZmpggPDxdHjhwRQgjxv//9T4wePVokJiYKg8EgPv74YzF16lSzx05NTRWBgYHCaDQq33333Xdi\n9OjRIjU1VZSVlYkFCxYoafmt9957T7z++uvCYDAIvV4vjh49qiwLDAwUycnJyucXXnhB/Otf/xJC\nCHHo0CFxzz33iOXLlwu9Xi++/fZbERYWJp555hlRVlYmLl26JIKCgkRKSorZ49bdV918nDJlisjO\nzhaFhYXivvvuE2vXrhVCCHH69GkRHh4u4uLihNFoFBs2bBARERFCp9PV23dZWZkYOnSo2LBhgzAa\njeLs2bNiwIABIj4+vt6xr7XfiIgIMXXqVJGbmyu0Wq0IDw8XkyZNEufOnRM6nU7MmDFDfPTRR0II\nITIzM0VoaKiIiYkRQgixf/9+ERoaKvLy8oQQQjz88MNi1KhR4sqVK6KyslI8/PDD4r333mvwOi5c\nuFB88sknQgghKisrxbFjx8zmZWJioujTp4/Yv3+/0Ov14j//+Y8YNWqUqKqqUo773Xffmd22rm+/\n/VZMnz7d5LvVq1eLxx9/3OS7J554QqxevVoIIUS/fv1EbGyssuzUqVMiODjYom3rKi8vF8HBweLK\nlSvKdw888IDYsmWLEEKIN998Uzz55JOiqKhIlJaWirlz54qlS5cKIYSIjY0V/fr1U+5BrVYrEhMT\nzZ57QUGBCAkJET/99JMwGAxi06ZNIiQkRBQUFCjrR0REiPj4eGEwGERxcbEIDg4Wly9fFkIIkZ2d\nrfyOfuvtt98Wb7zxhvI5LS1N9OnTR5SUlAghhDAYDGLgwIFKfj355JPi9ddfFxUVFSI3N1dMmTJF\nfPPNN0IIIbZs2SKGDBkiTp8+LYQQIjk5WaSnpwshqn+TBw4cUI5zresfEREhoqKiRGZmpqisrFS+\n+22ZJYQQS5cuFQ8//LDQ6/WN3hs6nU5ERESIzz//XOj1erFt2zbRo0ePevf0b61fv1788Y9/rPd9\nt27dTMqahjRWttWUswsXLhQVFRXiwoULIiwsTDnP1atXi6lTpwqtVit0Op147bXXxMKFC022ff75\n50V5ebmST99//70oKysTOp1OLFmyREycOFFJS0NlWM3x/vWvf4mpU6eKvLw8kZeXJ6ZOnSref/99\nIURt+fnhhx8KvV4vdu3aJXr37i2KioqUfW3fvl1MmjSp0fy4LR1FKpWK+fPnY2Njw7333ou9vT2R\nkZG0atUKLy8v+vfvz9mzZwH45ptvmDNnDh07dkStVjNnzhzOnz9PRkZGYwFY+f+mTZuYOXMmfn5+\n2Nvbs3DhQrZs2YLRaKy3nZWVFdnZ2aSmpqLRaOjXr5/ZfZpjbW3N3Llz0Wg0jBs3jvz8fB555BHs\n7e3p3LkznTt3brR2ZM6MGTNwd3fHxcWFiIgIzp07B8B3333Hgw8+SFBQECqViqioKGxsbIiNja23\nj19//RV/f3+ioqJQqVR0796d0aNHs23btnrrWrLfhx9+mNatW+Pp6Un//v3p3bs3gYGBWFtbM2rU\nKCWNP/30E8OGDWPw4MFAdY2mZ8+e7N69W9nX5MmTadeuHTY2Ntx3333KtjXq5rmVlRVpaWlotVps\nbGwIDg42m2dbt25l2LBhhIeHo9FoePTRR6moqDCpUdyosrIynJ2dTb5zcnJSnnZ/u9zZ2ZmysjKL\ntq3Lzs6OESNGKLWay5cvk5SUpDTBrVu3jhdffBFnZ2ccHByYM2eOsu66deuIjo5W+oA8PT3p2LGj\n2fPZtWsXHTp04P7770etVhMZGUmnTp349ddflXUmTZpEQEAAarUajUaDRqPh4sWLVFZW4u7uTkBA\ngNl9FxcX4+joqHz29fXlnnvu4ZdffgHgwIEDODg40KtXL3JyctizZw8vvfQStra2tG7dmkceeYTN\nmzcr5/TYY4/Ro0cPANq2bYuPj4+y77q/E0uu/4wZM/Dy8lJqheZs2bKFTZs28eGHH6LRaBq9N2Jj\nY9Hr9cyYMQONRsOYMWPo2bNng/u2xKRJkwgJCSE0NFSpnf6WJWXbn/70J2xtbenatSuTJ09W8vTb\nb7/lL3/5C56enlhbWzN//nx+/vlnZVuVSsWf/vQn7OzslHyaPHky9vb2yvrnz59vsBXLXFrnz59P\nq1ataNWqFQsWLODHH39UlltbWzNv3jw0Gg1Dhw7FwcGBpKQkZbmjo6NJq445NzRg40a0adNG+b+d\nnR3u7u7KZ1tbW+WmT09PZ/Hixbz99ttAbX+IVqs1+QE3JCsrC19fX+Wzn58fer2enJwcPD09TdZ9\n7LHH+PDDD5k9ezYqlYopU6aY7fswx83NTRm0YmdnZ/Yca87JUnW3t7e3Jzs7G6jOkx9//FEZLSeE\nQK/Xk5WVVW8f6enpnDx5ktDQUGVdg8FAVFSU2XWvtd+6abK1ta33ue5127p1q1IQ1uyrbsd63Wtu\nb2/faP4899xz/Otf/yI6OlrplzI3ku2311ulUuHj44NWq21w35ZycHCod7OWlJTg5ORkdnlJSQkO\nDg4WbftbkZGRvPPOO8ybN49NmzYxcuRIbGxsyMvLo7y83OTcjUajUoBnZmYydOhQi87nt3kF1UGm\nbl55e3sr/7e3t2fZsmWsWrWKl156iX79+vHcc8/RqVOnevt2cXGhtLS03jlt3ryZiRMnsmnTJqU/\nPD09Hb1ez6BBg4Daroea+zszM5N27drd0DmZu/51z8mcs2fPsmjRIlavXo2bm5uSxsbuDS8vL5N9\n+Pn5WZTehmzYsIG2bds2uk5jZRtUn3vdc/X19eXSpUvK+SxYsEAZECWEwMrKStkWTPPJaDSydOlS\nfv75Z/Lz81GpVKhUKvLz8xv8DTeWVl9fX5Nyxc3NzWRwlp2dncnvp7S0tN7D32/dtuBlKW9vb558\n8kmLBn7UBI+6PD09TfrI0tLSsLKyMik4azg4OPD888/z/PPPk5CQwPTp0+nVqxdhYWHY29tTXl6u\nrJudnX3Nm6CpeHt7M3fuXItGcfr4+DBgwAClP+BW7deS40ZFRfHGG29c97bmrmObNm2UvtBjx44x\na9YsQkND693gnp6eyg1aIyMj45Zcqy5durBmzRqT7y5evKj00XXu3Jnz588TFBQEVPdZdunSpdFt\n6/ar1jVo0CBefPFFzp8/z+bNm3nppZcAaNWqFfb29mzatKnewxdUX8O6fbN1/TZfPT092b59u8l3\n6enpDBkypMFtBg4cyMCBA9HpdCxbtoxXX32Vr776qt6xunXrZvJkDTB27FjeeecdtFotv/zyizKY\nxcfHB1tbWw4dOmT22nt7e5OcnGzxOd3M9c/Ly2PBggW8/vrrJiNBG7s3jhw5Uu/hKD093eKAe6Ma\nK9syMjIQQpCRkaHUvDMyMpTfjI+PD0uWLKFv37719puWlgaY5u3GjRv59ddf+fzzz/H19aW4uJiQ\nkJDrSmtaWppSU09PTzf7+21IQkLCNUfm3pZmw2s1wdU1bdo0VqxYQXx8PFDdHGGuyQugdevWqNVq\nkx96ZGQka9asITU1ldLSUpYtW0ZkZKTZIdi7du1StnVwcFCaSaB6wMamTZswGo3ExMTc0vdC3N3d\nGyxwzPnDH/7A2rVriYuLA6qbpHbv3m225jJs2DCSkpL48ccf0ev1VFVVcerUKWUQw43u91omTJjA\nzp072bt3L0ajkcrKSg4fPmxRDcjcddy2bZuyrYuLC2q12uw1vO+++9i1axcHDx5Er9ezatUqbG1t\n6dOnj0XpNhqN6HQ69Hq9yf8BQkNDUavVfPHFF+h0OuUpfMCAAQBERUWxZs0atFotWq2WNWvWMHny\n5Ea3DQsLM5uOmuand955h6KiIgYOHAigtAgsWbKEvLw8ALRaLXv37gUgOjqa9evXc/DgQYQQaLVa\n5Vr/9nc2dOhQrly5wubNmzEYDGzZsoXExEQiIiLMpik3N5edO3dSXl6OlZWVco+YM3DgQM6cOWMy\nkKR169aEhITw4osv0rZtW6XG5uHhwcCBA1myZAklJSUIIUhJSVHusSlTpvDZZ59x5swZAJKTk5Vu\nA3d3d1JTU5Vj3Mz1NxgM/OlPf2LChAmMHTvWZFlj90afPn2wsrLiiy++wGAwsH37dk6dOnXN490s\nS8q25cuXU1FRwaVLl1i/fj2RkZEATJ06laVLlyrBLy8vjx07dijb/baMLi0txcbGBhcXF8rKynjv\nvfdMgtu1yrDIyEg+/vhj8vLyyMvLY/ny5UycONHicz1y5IjJQ5U5Nx28zD05XWudxj6PHDmSxx9/\nnKeffpr+/fszYcIE9uzZY3a/dnZ2zJ07l2nTphEaGkpcXBzR0dFMnDiRhx9+mFGjRmFvb88rr7xi\ndvvLly8zc+ZM+vbty7Rp03jooYeUp4uXX36ZnTt3EhISwubNmxk5cuRNnWNd0dHRxMfHExoayoIF\nC665fs+ePVm0aBFvvPEGoaGhjBkzpsH3lhwdHfnss8/YsmWLMurovffeMylULN3v9ZyTt7c3y5cv\nZ8WKFYSHhxMREcFnn32m3BSNbWvuOp46dYopU6YQHBzM/Pnzefnll802zXTs2JF3332XRYsWER4e\nzq5du/jkk0+U90Ou9fv88ccf6dWrF2+88QbHjh2jd+/eygv21tbWLF++nA0bNhAaGsr69etZvny5\nsu8HH3yQiIgIJkyYwIQJE4iIiOAPf/iDRduaExkZyYEDB7jvvvtMCqS//vWvtG/fnj/84Q/079+f\n2bNnc/nyZQB69erFkiVLWLJkCf369WPGjBlKQT9jxgy2bdvGgAEDWLx4MW5ubnzyySesWrWKsLAw\nVq1axYoVK3B1dTWbV0ajkdWrVzNkyBDCwsI4cuQIr7/+utm0t2nThrCwMKWPq8b48eM5cOAA999/\nv8n3b7/9NlVVVURGRhIaGspTTz2lNJOPHTuWuXPn8swzzyjXv7CwEIA5c+awfPlyQkNDWb169Q1d\n/5rvMjMzOX78OJ9//rky2jA4OJjMzMxG7w1ra2s+/PBD1q9fT2hoKNu2bWP06NENXtdrsaQMBSwq\n20JDQxk1ahSzZs3iscceU5rtH3nkEUaMGMHs2bPp168fDz74oBKYzaUhKioKHx8fhgwZwvjx4+vV\n2K5Vhs2bN4+ePXsyYcIEJk6cSM+ePZk7d65FeZCVlUVCQsK1y1xxPdUiSZKkBiQkJPDCCy/w3Xff\n3emk/O6kpaUxcuRIzpw506wmbDDn7bffpl27dkybNq3R9WTwkiRJauaa62xDN+P3cZaSJEktnKXN\njy2FrHlJkiRJzY6seUmSJEnNzl33ntfN0OsN5Odf/zDvlqhVKweZF1fJvKgl86KWzItaHh6NvxB8\nN2pRNS8rK/PvoPweybyoJfOilsyLWjIvmrcWFbwkSZKk3wcZvCSpBSsq1VFcVv8FdUlq7mTwkqQW\n7J9rT/LBurhrryhJzUyLGrAhSVIto1GQnlOKrY3s25FaHlnzkqQWqri8CqMQlFfq0Rvq/z07SWrO\nZPCSpBaqsKT2T9MXl1XdwZRI0q0ng5cktVAFJbUDNeSgDamlkcFLklqoujWvIhm8pBamyYNXTEwM\nY8eOZcyYMaxcubLe8qNHjzJ58mR69OhR76+8QvWfTh8yZAhvvvlmUydVklqUgtI6Na9S2WwotSxN\nGryMRiOLFi1i1apVbNq0ic2bN5OQkGCyjq+vL2+99Va9P1ZX4/333yc0NLQpkylJLZJpn5eseUkt\nS5MGr7i4ONq3b4+fnx/W1tZERkaa/OlpqA5eXbt2NTud/+nTp8nLy2PQoEFNmUxJapEK6/R5FckB\nG1IL06TBS6vV4uPjo3z28vIiKyvLom2FELz99ts899xzyL/aIknXr6BU1ryklqtJX1K+maDz9ddf\nM2zYMLy8vK5rX81xduSmIvOi1u8xL4rL9djbaiivNFCpF0oe/B7zoiEyL5qvJg1e3t7epKenK5+1\nWi2enp4WbXvixAmOHz/O119/TWlpKXq9HkdHRxYuXNjodtnZxTeV5pbCw8NZ5sVVv8e8EEKQV1iB\nn4cjqVkl5BaUkZ1d/LvMi4bIvKjVHIN4kwavoKAgkpOTSUtLw8PDg82bN7N06dIG169bu/rnP/+p\n/H/Dhg2cOXPmmoFLkqRqZVdn1WjlZEthSaUcKi+1OE3a56XRaHj11VeZPXs248ePJzIykoCAAD74\n4AN+/fVXAE6dOsXQoUPZtm0br7/+eoOjDiVJslxBcXV/l6uTDS4ONnLAhtTiqEQLGw0hmwGqySaR\nWr/HvDhzOY/31p5kwsAOJKQVcuZyPp88MxQ/X7ffXV405Pf4u2hIc2w2lDNsSFILVPOOl5uTLc6O\nNoCc31BqWWTwkqQWqOYdL1cnG5ztrwavctnvJbUcMnjdJpU6A2u2nufoecvec5Okm1EzKa+bky0u\njtYAFMkpoqQWRAav2+RkfA4xseks/+E0KzeeobRCFiRS0ym8+oKyq6MNzg41zYay5iW1HPIvKd8m\nlzOLAGjjYsfBM1ouJBfwxIQedG3rdodTdnuUlFdx8lIO+cUV5JfoqKjUM3FQR7xaO9zppLVIBSU6\nVICLow3ODtU1L9nnJbUkMnjdJpczilEBf58dwi9HU/lp32VWbz3PP+aE3emk3RZfbr/A4XOmTaat\nXGyZMqzzHUpRy1ZYUomzgzVWGjUuV2te8l0vqSWRzYa3gVEILmuL8XF3xMHOmgmDOtLZz4WsvDKq\n9IY7nbwmpzcYiUvIpY2LLU//oTcvPhwMQFp26R1OWctVUKrD1ckWoE7Nq+UEr5LyKjnn6e+cDF63\ngTavjEqdgQ7ete9SeLdxQADa/PI7l7Db5GJKARU6A327eBDUqQ1d/N1wdbIhNbvkTietRarQ6anU\nGXB1qq5x1fZ5tYxmw0upBfz5/T38c+1Jsgta/v0jmSeD121wObP6RUiT4NXaEYDM3LI7kqbbKTY+\nF4Dend2V7/w9nMgrqqRMDly55WqGybs5Vte87Gw0WGnULabmde5KvvLvq6sO8cvRFIyyFva7I4PX\nbXA5oyZ4uSjfeV8dqJCZ1/KDV1xCDrY2GpPBKf4e1cE7VTYd3nIFJbVTQwGoVCpcHK1bzFD5mt/M\nlIgArDVqvv7lEv/dduEOp0q63eSAjdvgcmYRKhW09XJSvvNu8/sIXpl5ZWjzy+nX1QNrq9pnJX+P\n6rxIyy753Yy4vF56g5HSCj2uV2fIsFRhae07XjWc7W3IyL27HxTKKqrYeiiZvKJKist1lJRVMaS3\nL8P6+pmsl5Zdgr2tFWND23FvTx+WfHGUg2cyeWhUF6ytNHco9dLtJmteTcxoFFzRFuPn7oitde2N\n5e5qh0atQtvCg1dsfA4AvTq3Mfm+JnjJmlfDvvrfRZ7/ZD9FpdfX3KdMylsn6Dk7WqPTG6mo1N/S\nNN5K2w4ns/nAFQ6cyeR0Yh6XM4v5+UiKyTpVegPavHL8PBxRqVS4OtrQr6snOr2R88kFdyjl0p0g\ng1cTy8gtRVdlNGkyBLDSqPFwsyczr6xFj5pSgleAu8n3Pm0cUKmQgzYaUFpRxf7TmeiqjCSmF13X\ntgVmal41w+VrmhTvNlV6I7tPpuNoZ8U/ngjj44VD6d6+Fdq8MpMX+tNzyjAKQVuP2laMoIDqB6O4\nq32rUjVtfhknLmbf6WQ0GRm8mpgyWMOn/qzN3q0dKK3QU1x+Y30Rd3vQK6uo4lJqIR19XOo1fdlY\na/Bq5UBqduldfx51ZeSWsnzDKfKLmzYIHDidSZXeCNS+4G6pwt/0eUHtcPnrrcXdLkfPZ1FcVsXg\n3r54tXLA1kZDJ9/qB76kOsG75mGnps8UoIu/K/a2GmITcprVb6mprd5yng/XnyIhvfBOJ6VJyODV\nxMwN1qihDNpoZMThtzvj+b9fLtX7Pq+ogqc+2Mv2w8m3KKW33umkPAxGQe+ANmaX+3s4Ul6pNwkE\nCWmFrN1xCYPReLuSeV02xCRy9EI2/zuaUm9ZUZmO9JybbwYVQhATm45GrQJqH4AsVTuvYW3wuttr\nXjuOp6ICIur0b9UEr8SM+sHLr07Ny0qjpkeH1uQUVpDxOxi9a4n84koupVQ3o/58uP5vtSWQwauJ\nXc4sQqNW0dbTsd6yaw3aiE8tZNvhZP53NKVeoRgTm05JeRUHz2pvfaJvQn5xJTmF5RSWVHL8apNF\n3SHyddX2e9U2Ha7deYntR1K4eBf2X+QUlnPs6jntP52J3lAbYIUQfPh9HH9bfZicm3z3KDG9iNTs\nUvp29aCNiy2XM4quq0ZRWKrD0c7KZPCCU03N6y4MXkkZRSSmF9G7szsebvbK9518XQFMmk1r+kjr\n1rygtlk6LkE2HQIcOZ+FADRqFccuZLXI9+Fk8GpCeoOR5KwS/DwczY6Camy4vBCC73cnKJ93nUxT\n/m8wGtkTlwHAFW3xLZvk1yjETTW7nLyUwzP/3sdzHx/g6Y/2cfhcFm5ONrSrM8qyLj9lxGF1gZSZ\nV0ZCWnVBdSn17mvq2Hk8DSGqB9sUleo4lVhbUF5MKSAhrQi9QbD54JWbOs7u2HQAhvb2pYO3C0Vl\nVdfVTFlYUqnMrlGjtuZ19zUb7jyWCsDwfqajCl0dbWjjYkdiem3wTs0uoY2LLQ521ibrKv1eCTm3\nIcV3vyPntKhUED0sACEw21LQ3Mng1YTSc0qp0tcfrFGjsWbDM5fzuJBSQI+OrXF1tGH/qUwqq6qn\nkjqVmEd+cSU21mqE4JbUUsor9Ty7fD///fnG3pcxCsH6mERUKgjr4UVIoCd9u7jz4IguqFQqs9v4\ne9a861WD0HifAAAgAElEQVRd89p/OlNZdjH17qp5VeoMxJxMx8XBmicm9gBg79UHCIBth6qbb53s\nrdkbl0FuYcUNHae8Us/hc1rcXe3o3qGV0ldqadNhld5gdnh9zSwbhXdZzauoTMehc1l4tXbgng6t\n6y3v5OtCSXkV2YUVFJfpKCzRmTQZ1nB1tKGDtzOXUgspq7gzIypv9kXp3zah36icgnIS0osIbNeK\nEf38aeVsy57YjBb3lyxk8GpCjQ3WgOpOdAdbq3o1r+paVyIAU4YFMLi3L2VXCzWAmJPVT+YPDAkA\n4Fxy/k2n9XRSdUDcfTKds5fzrnv7ExdzSM0uYcA9Xsy5vwdPRvXkTw/0IrS7V4PbeLjZY2OtJjW7\nFKMQHDidgZ2NBk83exLSiu6qfq/9pzMoq9QzrK8fAb6utPNyIjY+l8KSStJySolNyKWznysPjuiM\nwXjjta9DZ7XoqowM7u2LWqVSHnwsHbRRaKa/C8DlarPh3RS8Siuq+GFPEnqDkeHBfqjNPOQo/V7p\nhXWaDM3X5HsFtMFgFDf0+71Ze2LTmbd0NxdTbuyhq7xSz5v/Pcqrnx666Vlnjlz9m4ED7vHCSqNm\nZH9/KqsM7L5abrQUMnjdpLKKKtbuuGS2Tfn81WlsOjZQ81KpVHi3cSC7oNykoD52IZsrmcWEdvek\nnZczQ3v7olLBrhPp5BVVEJuQQwdvZ4b19cPaSq0c52bUHVL7xc8XrmvCYCEEG/cloQLGh3eweDu1\nSoWfuyMZuaWcu5xPblEl/QM96d6hFZVVBpK1t3YYfUZuKa+tOsSWRgJLld5ITGw6b311nP/+fIH8\n4kqMQvC/o6lo1CplQMHgXr7VAfeMlp+v1rrGDmjHgHu88Gxlz57Y6mt1PfQGI7+eSEOtUjEoyAeA\n9lenFKsZ+NMQo1EQl5DD59vOA9RrNqyted1cs2FZhf6mA2BiehErN55h4Uf72HUiDRdHGwb29DG7\nbm3wKjI70rCumr7V2NvcdGgUgi0Hr6CrMvLpprOUX+e7dEIIVm0+R0ZuGWWVemJiM669USMOn8tC\no1YR3NUDgKG9/bCz0fDL0RQycku5kJzP0fNZN9w6cLeQM2zcpB/2JvHL0VS0eWU8NaW38n1uYQVH\nzmfh08bBZGaN3/Ju7UBiehE5BRV4tXbAaBRs2JOIWqUianAnANq42tGrUxtiE3L5+pdLCAFD+/hi\nbaWmi78rZy/nU1Sqw+U6Z2KooTcYiU3IpY2LHX27uvPL0VS2HExm4qCOFm1/Mj6H5KwSQrt74utu\nvmBpiJ+HE0kZxayPqa5pDuzpTV5RdQ3wUkoBHX3MB/7rVVSqY9m3seQUVrBuVwKujjYMDKotMCt0\nenYcS+WXo6nKDBUXUwrYfyqD3p3dycwr496e3kpQGHCPF9/svMTO46nkF1fi1dqBPl3cUatUjA/v\nwGdbzrH54BWmj+5mUfqMQvDZ5nOkZFXXXls5Vx/Hyd4ad1c7LmcWI4Qw2wR75nIen289T87VwqiT\nr4vJuQHY2miwsVYrf6SyMWcu53HyYg4TB3fEyb62b6msooq/rT5CaYWev88Kwb3O4ApLZeWX8Y8v\nj2EwCrxa2TOkty/3BvngYGe+KGrv5YxGrSIpvQjd1Wbzhmpe7b2dcXG0ITY+l437krC11mBtrUGv\nN1JRZUBXZSCwfSt6mGmevBnnLuejzS/H0c6KnMIKvv01nkfGBlq8/ZaDVzh+MZvO/q6kaEv45VgK\no0L80ajN1y1yCsrJK640OzONNq+MK9pigjq1Ua6dg50VQ3r7sv1ICi//55Cybr+uHsyfHHSdZ3v3\naPKaV0xMDGPHjmXMmDGsXLmy3vKjR48yefJkevTowfbt25Xvz58/z4MPPsj999/PxIkT2bJlS1Mn\n9bplFZTz6/HqgRSxCbkmo6K2H0nBYBTcN6C92eaQGjX9XhlXmw73nsogI7eMgUHeyjJAmSLn+MVs\nbG00SnNc9/atADh/E02H55PzKa/U07erO5MGd6KVsy2bD1y2aOoqIQQ/7b2MCrh/oGXBri7/q8Eu\nKaMId1c7urR1o0vb6lFmF2/RoI0KnZ7318WRU1jB0D6+ONhasWbreaWJJz6tkL99doTvdydSWWVg\nbGg73pkbzsz7AnG0t1aaYUb1b6vs08nemr5dPMgprMBgFIwJbatc5/CeXni42bEnNp2kjPrNffFp\nhRw4k6k0Dwkh+HL7RQ6e1RLg58LM3xR8HXyq+33MPSnnFJTz8YbTFJRUMqS3D6/PDOGVGf3xM/MQ\n4eJgQ2EjfSpVeiPf7LzEe2tPsuN4Kh//cFoZUSmE4L8/XyCnsILySj2fbjqL0Xj9fTw198VDo7qy\nZE4Y94W1b3T6KxtrDf4eTlzRlnA5sxiNWqWM0v0ttUpFv64elJRXsWFPEmt3xvPFzxf4vx2X2BCT\nyOYDV1i+4dRN9f2cuZxXr1lv5/HqASd/eqAX/h5O7D6ZbvGoxzNJeayPSaSVsy0LJgUxMKj64e3Y\nBfMvF5+8lMNrnx3mra+Om32lpKZrIbS7p8n348LaE9bDi0G9fIgMb88fR3Zh6vDm/bf0mrTmZTQa\nWbRoEWvWrMHT05Po6GhGjBhBQECAso6vry9vvfUWn332mcm29vb2vPPOO7Rr146srCwmT57MkCFD\ncHJquBZzu22IScRgFAzr68euE2n8uDeJp//Qm5LyKmJi02nlbEtYj4b7fMB00EZlOwM/7EnExkqt\n1LpqBHVqQxsXO3KLKhjQ3Qt72+pLF6gEr4JG+5cac+JidTNLcBcP7G2tmDaiC8t/OM0XP1/gmQf7\nNBp8YxNyuaItJiTQ02yBeS1+nrXX896e3qhVKtxd7WntYsul1AKT2sbxi9moVSr6dDE/9N4cg9HI\ne18dIymjiHt7ejNjTDdCAz1Z+m0sH60/xb09vatHYgkYG9qO8fd2UGoBQ9zsCbvHi10n0hDUNuHV\nGNzLhyPns3BxsGZgT2/le41aTdSgTvxn01kWfX6UPp3dGX9vB/KLK9l2+IoyotLaSk2/bh7YWGmI\niU2nracTT0/pja2N6cjUjt7OHD2fxeXMYpPajt5g5OMfz1BWqWfWuEAG9/JtNC+cHayVl8Lr1uCM\nRkFSRhFf/HyB5KwSvFrZ08bVjrOX8/lmZzwPjerKgTOZHD6XRYCfC26Othy7mM22w8mMC2tv8bUo\nLtOxNy6DNi62DO3j2+BAnt/q5OvCFW0xydoS/D0csdI0/Mz94IguhPf0plJnoPJqbcvaSo2tjYaz\nl/PZdiiZnw+nMHlI7f1VXqlnQ0wirVxs6dGhNf6eTmZ/86cTc1n6bSyd/Vx57o99sdKoyS2s4GR8\nDu29neni78rj99/DG2uOsHrrOSYO6kh8aiHxqYU4OVgzc2wg/nV+76cTc1nx0xk0ahXzJvXExdGG\nUf3b8uvxNLYfSTG5n4UQbD5whQ0xiVhZqfFws2P7kRTSckqZO7EH1ho1567ks/dUBlYaFX27eJik\n3cXRhjn397Aov5uLJg1ecXFxtG/fHj+/6lpDZGQkO3bsqBe8gHo/5Pbta28KT09P2rRpQ15eXqPB\nq6lmD9AbjPxn41kc7ayYEtEZe1srkjKKOHRWS3tvZx4e3ZXM3FJOJeYSn1bI2aQ8KqsMRA3u2OiN\nBqbD5bcfTaGgRMf4e9srzUY11GoVYwe045ud8QwPrh1S3MHbGTsbjfJnIqC6U/7AuSw6eTnh1dr8\nU2oNoxCcuJSNk721UuPp182D3gHVzZQ/H07mvgHmC6hKnYFvdsYDcP/ADo0epyF1m4DC6wSALv5u\nHDqrJTOvDJ82jqTnlLJ8w2mMQjDzvkCG9DZfUFfqDOyJS+dCcgEZeWVk5ZehNwgC27kx875AVCoV\n3Tu0ZvqYbqzZep7tR1Jo42LHY+O7061dq3r7s7HWMDq0ndlj3dOhNSOC/Qls71bvVYjwnt44O1jz\n077LnIzP4WR8bT9Mn87udPB25sCZTA6eqX5S9mrtwDNT+9QbAg61f0onKbOI/oG1T9TrdiWQlFFE\neA8vpY+sMc4ONlTpiymr1KPNK+dMUi4XUwtJSCukQlfdJDe4lw/TRnZBCFjyxTF2HEvF0c6K7UdS\nsLPR8Pj9PXCwtSI+vZANMYn06NC6XlBvyK8n0tDpjYwKaXfN+6KuTr4u/HqiuoWjoSbDGtZWajr7\nuZpd1sXfjf2nM/nlaAqjQ9pSU7x/uf0iB85Uj3T9jgScHawZHdKWyN/032692rcZn1bI97sTmDq8\nC7tjq1+fGN7XD5VKRVtPJ6IGd+T73YnKTPf2thqyCspZ9N+jTBvRhUG9fPhhTxJbDl7BSqNi1n3d\nCbj6TptXawd6d3bnZHwO8WmFdPZzpbBUx5fbL3DsQjatXWz50+ReeLjZs3LjGeIScnn5P4eoqNSj\nuzojy5Devg02w7YkTXqGWq0WH5/am8rLy4tTp05d937i4uLQ6/W0a2e+EKnx0GtbmTaiC6NC2ja6\nXg0hBMnaElq52CrvwZizcd9lpenodFIecyb0YMPVPpo/DAtQ+qfe+uo463YlkJ5TiuPVduZr8Wxl\njwpISC/k8Dktzg7WDQaL4cF+DO7lg02dCX41ajVd27oRl5BLXlEFGo2at78+oTT5dW3rxuBePrT1\ndEKjVqFWq2jtbKc83SdlFFFQomNgkLfSxq5SqZg1rjuvrz7M97sS6eLvZrZA+L8dl9DmlTE6pO01\nC5WGuDra0M7TiVbOtni1qg20Xf1dOXRWy6XUQnzaOPLdr/EYhcDGWs3nW89jY6UmrEdtsCutqFL6\nrEquTrdlb6uhracTgR3bEBna1qTAHNLbl0qdgdyiCiYO6qjUZK+HWq3iodFdG1zes1MbenRszfkr\n+ew4noaTfXWhWNMveP/ADlxKLeR0Uh7D+vg22GdpbtDGiUvZbD+SgndrB6aP6WZRLaZmiqjnPt5P\neWXtgBzv1g508XelXzdPetWZDeVP0b1YtOYIP+27DMBj47vjebXm9+i47iz9NpaVG8/w2swQk0mn\nzdFVGdhxLBV7WysG97p2oK2rZtAGgF8DgzUsYWutYVxYe9buuMTPh5OZ2641h85qOXAmk44+zozs\n35azSXnEJuTy/e5EurVtRWf/6t/9lcxizl3Jp4u/K8VlVfx8OIWOPi7EXJ2PMfSe2lrSfQPao1ar\nsLPW0KWtG77ujsTG5/DZ5nP89+cL/Lg3icJSHZ5u9syN6lHvVZpRIW05GZ/DtkPJdGvnxg97Eimv\nNNC1rRvzonoqv5M/P9CLDXsS2XYouTroBbShd2f3BoN3S9OkwetWzDOWlZXFc889xzvvvHPNdVs5\n27J25yU6+LtxbyNNKAaj4NDpDNbviufClXzcnG15ZVYo3drX78i9mJzP5oNX8Gxlz+A+fqzfFc8/\nvjyGEBAc6MmQkOpA4+HhTJ/DKZy8VN1WPXVkV9r513+SN8ejtYPyou7MyHss3q5GSA9v4hJyOZ9W\nxC+Hk8nMK2NYP3/yCiuIi8+pN3zXyd6a52f0p09XT7ZcnTpmWP92eHjUPkF7eMDz00N45ZN9rNx4\nlg+eGaaMWAM4cCqdmNh0Ovq6MDe69039KYoPnx2uzAZQY0AvP77YfpHk7FLSCyqITcilZ0AbHpvQ\nk5c/3senm89hY2dNhc7A8QtZnE7IRVdlwMnemgdHdWP0gPa4u9k1Wqj/cdw9N5zm6+Hp6aL8Tswt\nGxh87YctH3dHkrNKcHd3YvfxVFZuPIuNlZqXZoXS1teywiqwozv7TmXiaGfN4D7+BHfzpGdAm3oj\nE2t4eDjz4sxQ/v7pQQb38WPCsNp39iI8nLmYVsSmfUms3nqBF2eG1KtNGY0C9dVruu3AZYrLqoge\n3uW6f99t2jjhaGdFaYWeHp09TH6n1yt6VDe2H0lmx7FURoZ14MvtF7Cz0fDCI6H4ejgxYRicTcrl\n+Y/28u2ueN57aihqtYrPf74IwEP3dcfd1Z6F78ew8qczGAVEDQ3A39d08MSM8T1NPo/2dKFvdx/+\n+dVRziblMaSPH/On9DZb03Z3d2Ld7gSOX8zm+MVsHO2tmTu5B2PDO5jcIwBzo/vw+OTe9b7/PWjS\n4OXt7U16eu27BVqtFk9Pz0a2MFVSUsLcuXNZuHAhvXr1uub6rz0Wxgsf7eWfXx3jWaPR7BNIfGoh\nqzafRZtfPbS9W1s3LqYW8MK/9zF7XKDJ07yuysA/vzyK0Sh4ZGwg3du3orOPM//ZdJbCEh0TwtuT\nnV37NDxuQDtOXsrG2kpNeHdPk2WN8XS1IyuvDK9W9gR3bmPxdjXaXu3A/vTH0wCMCPbnqWnB5OSU\nkFVQzqGzWopLdRiMAl2VgUPntLy+8iAPjujMvtg0bKzU+Le2r3dcb1dbJgzsyA97k3jn8yPMmXAP\ndjZW5BdX8v7aE1hbqZk9rjsF+bd+Pjk7DTjaWRF3KZv4q4NRJg/uiIuthqem9Oa9tSf56LtYZX1f\nd0cGBnkzrI9fdS1Krycnp3potYeH83Xn6d2mrYcjh3NK+ft/DnDsQjZ2NhrmTOyBk7Xa4nML7+7B\nkL5j0FfolCCkK9eRXd5wc7uvmx3LFgzC3laj5GeN+8Pbk5RWwOGzmbzz+REeHd8dtUpFUkYRq7ec\nI6ugnF4B7oQGevJ9TCIateq67ou6Ovq4cDopDxdbzU1fy7Gh7fj6l0s8/9FedFUGZt4XiDVC2a+H\nkw1h93hx8KyWDTsvck+HVuw5mYa/hyNtW9ujUqmYProrqzafA2BANw+L07RwSm+0+WXVk3IXV1Ba\nbH64emRYez7+4TThPb2JHhaAi4MNeblN9xcYbuaB4E5p0uAVFBREcnIyaWlpeHh4sHnzZpYuXdrg\n+nVralVVVcyfP5+oqChGjx5t0fE6+7vxZFRPPlgXxwfr4njuj31NmrMOn9Py6aZzGI2Cwb18GDug\nHT5tHDmVmMsnP55m5cazxKcV0qNja/zcHdlxLI2M3DJG9vNXRvV1a9eKxY+FUVymqzdUuLO/K9HD\nAnB1tLmuYettPZ04nZRH9LCA6+oLqOHvWftkOqS3D9NG1T4he7rZc/+9HUzWH9rHj482nOLrqxP+\nBnf1aLDZZ/y9HbiQUsDJ+BzmLY3BzckGlUpFaYWe6aO73tAgDUuoVSo6+7kSe3XUVngPb6V5pbOf\nKwun9mZPXAZd/F3p0aE1rV3smiQdd4sO3i4cPpfFsQvZtPV0Yt6knibNrJZQq1S0drEju/L6Rts1\n1H9ibaVm/uQg3lt7kgNnMnGws8LWWsPWQ1cQAtq42HH0fBZHrza5DwryqdeXa6lpI7uQll16S67z\n0D6+bD2UTH5xJcFdPcw2Y06J6MyJSzl8vzuB+FR3jEIwJrSdcl8NDPKpfgfQKK7Zr1yXWq3Cp821\n75ngrh6s+OswpeYq1acSTfw3BGJiYli8eDFCCKKjo5kzZw4ffPABQUFBREREcOrUKRYsWEBRURG2\ntrZ4eHiwceNGfvrpJ1566SW6dOmijI76xz/+QWBg4+9PZGcXExObzpqt51EBfbq4Mzqk7dVO1kTs\nbDTMi+pJz06mM52n55Tywbo4sn7zsrFXawf+Nuvabfo3o6xCT0pWsdkBA5baE5tOdmEFUYM6olar\nrlnbyCuq4IPv40jWljBnwj2E3ePd4LrFZTq2HLxCanYp2rwycgsrCO7qwbxJPS0eMXYjth68wne7\nErC2UvOPOWE3XHC1hJpXSlYJiz4/SngPLx4a1dWk3/N6NEVelJRX8fZXx0m7Onm0u6sds8Z1J7Cd\nGylZJRw+l8UVbTEzxnQzmXj3Tjp+MZtD57N4eGQXk+bwujbtv6y8f9jK2Za354bf0MNlc9Aca15N\nHrxut5ob89iFLDYfuGIyJ1wrZ1v+MqU3bT3NDy6o0Ok5dyWf9JxS0nJKyS+qZOqIzg3OTXg3s6SQ\nqqwycCWzmC7+rtcVhPQGIxq1qkkDF0BqVgmvrz7MxEEdmXAD75DVaAnBC6rz/WYLz6bKi/ziSlb+\ndIa2Xk5MHtIJO5u7f7TbtfKiSm/glU8PkV1QwZSIgAYHUrUEMnjdBer+GIUQJKQVsf1oCuWVemaP\n637DzRbNTUspsEvKq3C0s7qpQNlS8uJWkHlRy5K8iE8tZPfJNP44qusNjUhtLppj8Gq5V4PqId+d\n/V2V4a5S81N3eiJJut1k+XH3apkNuJIkSVKLJoOXJEmS1OzI4CVJkiQ1OzJ4SZIkSc2ODF6SJElS\nsyODlyRJktTsyOAlSZIkNTsyeEmSJEnNjgxekiRJUrMjg5ckSZLU7MjgJUmSJDU7MnhJkiRJzY4M\nXpIkSVKzI4OXJEmS1OzI4CVJkiQ1OzJ4SZIkSc2ODF6SJElSsyODlyRJktTsyOAlSZIkNTtNHrxi\nYmIYO3YsY8aMYeXKlfWWHz16lMmTJ9OjRw+2b99usmzDhg2MGTOGMWPG8MMPPzR1UiVJkqRmwupa\nK6SkpLBu3ToOHTpEZmYmtra2BAYGMmbMGEaPHo2VVcO7MBqNLFq0iDVr1uDp6Ul0dDQjRowgICBA\nWcfX15e33nqLzz77zGTbwsJC/v3vf7NhwwaEEEyePJkRI0bg7Ox8E6crSZIktQSNBq/XXnuNM2fO\nMHbsWP7617/i7u5OZWUlCQkJ7N27l5UrV/K3v/2NPn36mN0+Li6O9u3b4+fnB0BkZCQ7duyoF7wA\nVCqVybZ79+5l4MCBSrAaOHAge/bsYdy4cTd+tpIkSVKL0GjwGjFiBG+88Ua977t168a4ceMoKCgg\nJSWlwe21Wi0+Pj7KZy8vL06dOmVRwsxtq9VqLdpWkiRJatkaDV5Dhw5tdGM3Nzfc3NwaXC6EuLFU\nNbDtb2tn5nh4yGbFGjIvasm8qCXzopbMi+brmn1eAG+99Rbz58/H3t6eGTNmcPbsWf7+978zceLE\nRrfz9vYmPT1d+azVavH09LQoYd7e3hw6dEj5nJmZSVhY2DW3y84utmj/LZ2Hh7PMi6tkXtSSeVFL\n5kWt5hjELRptuH//fpydndm7dy9eXl78/PPP9QZYmBMUFERycjJpaWnodDo2b97MiBEjGly/bm1r\n0KBB7N+/n+LiYgoLC9m/fz+DBg2yJLmSJElSC2dRzavGkSNHGDVqFF5eXhY14Wk0Gl599VVmz56N\nEILo6GgCAgL44IMPCAoKIiIiglOnTrFgwQKKior49ddf+eijj9i4cSOurq7MmzePBx54AJVKxYIF\nC3BxcbnhE5UkSZJaDpWwoGNq1qxZ+Pn5sW/fPn744QccHR2ZNGkSGzduvB1pvC6yGaCabBKpJfOi\nlsyLWjIvarXYZsP33nuPzp07s2zZMlxdXcnMzGTWrFlNnTZJkiRJMsuiZsPWrVszc+ZM5bO/vz/+\n/v5NlSZJkiRJalSjwSssLKzRvq0DBw7c8gRJkiRJ0rU0Gry+//57ANatW0dBQQFTp05FCMH333+P\nl5fXbUmgJEmSJP1Wo8GrZlqnI0eO8OWXXyrfv/LKKzz88MM8/vjjTZs6SZIkSTLDogEbWVlZ5OXl\nKZ/z8vLIzs5uskRJkiRJUmMsGrDxyCOPEBUVxbBhwwDYvXs3TzzxRFOmS5IkSZIaZFHweuihh+jX\nrx9HjhxBCMFDDz1Et27dmjptkiRJkmSWxTNsBAYGEhgY2JRpkSRJkiSLWBS8jh8/zrvvvktKSgoG\ngwEhBCqVSg6VlyRJku4Ii4LXyy+/zLx58+jTpw9qtUVjPCRJkiSpyVgUvOzs7Lj//vubOi2SJEmS\nZBGLqlFDhgxh9+7dTZ0WSZIkSbKIRTWvb775hhUrVuDo6IiNjY3s85IkSZLuKIuCV800UZIkSZJ0\nN7AoePn5+aHX60lKSkKlUtGhQwesrK7r71hKkiRJ0i1jUQQ6deoUf/7zn5UmQ71ez4cffkiPHj2a\nOn2SJEmSVI9FwWvx4sUsWbKE8PBwAA4ePMiiRYtYu3ZtkyZOkiRJksyxaLRheXm5Erig+u98lZeX\nN1miJEmSJKkxFgUve3t7Dh48qHw+fPgw9vb2TZYoSZIkSWqMRc2GL730Ek899RQ2NjYAVFVV8cEH\nH1h0gJiYGJYsWYIQggceeIA5c+aYLNfpdDz//POcOXOGVq1asWzZMnx9fdHr9bzyyiucOXMGo9HI\nxIkT620rSZIk/T5ZFLx69erF9u3bSUpKQghBp06dsLa2vuZ2RqORRYsWsWbNGjw9PYmOjmbEiBEE\nBAQo66xbtw5XV1e2b9/Oli1bePfdd1m2bBnbtm2jqqqKjRs3UlFRwbhx4xg/fjy+vr43fraSJElS\ni2BRs+H+/fupqKiga9eudOvWjfLycoteUI6Li6N9+/b4+flhbW1NZGQkO3bsMFlnx44dTJo0CYAx\nY8YozZMqlYqysjIMBgPl5eXY2Njg5OR0vecnSZIktUAWBa933nnHJHA4OTnxzjvvXHM7rVaLj4+P\n8tnLy4usrCyTdbKysvD29gZAo9Hg7OxMQUEBY8aMwd7enkGDBjF8+HAeffRRXFxcLDopSZIkqWWz\nqNmwZjqoGmq1GoPBYNF217tOzbHi4uLQaDTs27ePgoIC/vjHPxIeHo6/v78lSZYkSZJaMIuCl6Oj\nI7GxsfTu3RuA2NhYHBwcrrmdt7c36enpymetVounp2e9dTIzM/Hy8sJgMFBSUoKrqyubNm1i8ODB\nqNVqWrduTXBwMKdPn75m8PLwcLbklH4XZF7UknlRS+ZFLZkXzZdFwevZZ59l/vz5dO7cGYD4+Hg+\n+uija24XFBREcnIyaWlpeHh4sHnzZpYuXWqyTkREBBs2bKB3795s27aNsLAwAHx8fDh48CATJkyg\nrKyM2NhYZs6cec1jZmcXW3JKLZ6Hh7PMi6tkXtSSeVFL5kWt5hjEVcKStj2gsLCQkydPIoSgb9++\nuLq6WnSAmJgYFi9ejBCC6Oho5syZwwcffEBQUBARERHodDqeffZZzp07h5ubG0uXLsXf35+ysjJe\nfNEYr50AABg9SURBVPFFEhISAHjggQeYNWvWNY8nf4zV5I1ZS+ZFLZkXtWRe1GrRwSspKYmEhARG\njhxJaWkpVVVVuLm5NXX6rpv8MVaTN2YtmRe1ZF7UknlRqzkGL4tGG27YsIEnn3ySf/zjH0B139Vf\n/vKXJk2YJEmSJDXEouD1+eef8/333+PsXB2dO3XqRE5OTpMmTJIkSZIaYlHwsra2xtHR0eQ7jUbT\nJAmSJEmSpGuxKHi5ubkpf4gS4Mcff1ReLJYkSZKk283iiXmfeeYZkpKSGD58OHZ2dnzyySdNnTZJ\nkiRJMsui4NWxY0e+++47Ll++jBCCjh07ymZDSZIk6Y6xqNkwKSkJvV5PQEAAGRkZrFq1isLCwqZO\nmyRJkiSZZVHw+stf/oJarSYlJYXXX3+dlJQUnn/++aZOmyRJkiSZZVHwUqvVWFtbs3v3bqZNm8ai\nRYvIyMho6rRJkiRJklkWBa/Kykq0Wi07d+5U5h60cGIOSZIkSbrlLApejzzyCJGRkTg6OhIUFERK\nSorywrIkSZIk3W4Wz21Yl8FgwGAwYGNj0xRpuilyrrJqct62WjIvasm8qCXzolaLm9vw9OnTZr/X\naDTY2Nig0+mUWd8lSZIk6XZp9D2vFStWUF5ezvjx4+nduzfu7u5UVlaSlJTEnj172L17Ny+88AIB\nAQG3K72SJEmS1Hjw+vDDD4mLi+Obb77h3//+N5mZmdjb29O1a1dGjhzJV199hZOT0+1KqyRJkiQB\nFsyw0atXL3r16nU70iJJkiRJFrFotKEkSZIk3U1k8JIkSZKaHRm8JEmSpGZHBi9JkiSp2bEoeOXm\n5vLXv/6Vhx56CIDz58/zf//3f02aMEmSJElqiEXB65VXXqFfv34UFRUB0KlTJ77++muLDhATE8PY\nsWMZM2YMK1eurLdcp9Px9NNPM3r0aKZOnUp6erqy7Pz58zz44IOMHz+eCRMmoNPpLDqmJEmS1LJZ\nFLy0Wi3Tpk1T/gCljY0NavW1NzUajSxatIhVq1axadMmNm/eXG9GjnXr1uHq6sr27dt55JFHePfd\nd4HqKaiee+453njjDTZt2sQXX3yBtbX19Z6fJEmS1AJZFLysrExfBysqKrJoVvm4uDjat2+Pn58f\n1tbWREZGsmPHDpN1duzYwaRJkwAYM2YMBw8eBGDv3r0EBgbStWtXAFxdXVGpVJYkV5IkSWrhLApe\no0eP5rXXXqO0tJT169cze/ZsHnjggWtup9Vq8fHxUT57eXmRlZVlsk5WVhbe3t5A9ZyJzs7OFBT8\nf3v3HhxVef9x/L1sAlJMgpiQRaS0JraQGqAzKsERIYBZIITsBiIMUsKlpdoBKqFYwck4crXGyUhk\nOhIBKzRMa4HIJRBSgxI6XGy1hZkCRUEn3JJwS5NgypLN8/sjP3YbgrBWNvEkn9df7Nlnz373yzN8\nOGfPPqeKL774AoAZM2aQlpbG6tWrA/1MIiLSxt12hQ2An/70p2zdupXq6mr27NnDT37yE1JTU2/7\nukCOzm4cY4zBZrPh9Xr55JNP2LRpE506dWLq1Kk89NBDvvuJiYhI+xVQeAGMHTuWsWPHfq2dOxyO\nJhdgVFRU0L1792ZjysvLiY6Oxuv1UltbS0REBA6Hg0ceeYSIiAgAnnjiCY4cOXLb8LLi0v7Bol74\nqRd+6oWfemFdAYXXxYsX+f3vf09ZWRn19fW+7StWrLjl6+Lj4ykrK+PMmTNERUVRWFhITk5OkzGJ\niYkUFBTQv39/ioqKfOH0+OOPs3r1aq5evYrdbuevf/0rU6dOvW2tuj9PI92ryE+98FMv/NQLPyuG\neEDh9Ytf/IK4uDgGDRrku+IwEHa7naysLKZPn44xhvHjxxMTE0Nubi7x8fEkJiaSnp7O/PnzSUpK\nomvXrr5wCw8PZ9q0aYwbNw6bzcbQoUMZMmTI//YpRUSkTQnoTspjx45l69atLVHPN6b/STXS/yr9\n1As/9cJPvfCz4pFXQFcb9u/fn3/961/BrkVERCQgAZ02nDhxIpMnT8bhcNCpUyff9o0bNwatMBER\nka8SUHjNnz+fZ555hri4uK/1nZeIiEgwBBRenTp1YsaMGcGuRUREJCABfec1ePBgSktLg12LiIhI\nQAI68nr33XfJy8ujS5cudOzY0bcKxv79+4Ndn4iISDMBhdemTZuCXYeIiEjAAgqvnj17BrsOERGR\ngN0yvObPn092drZvlYsb6VJ5ERFpDbcMr4yMDAB+/etft0gxIiIigbhleG3YsIFly5bx6KOPtlQ9\nIiIit3XLS+WPHj3aUnWIiIgELKDfeYmIiHyb3PK04fHjxxk0aFCz7fqdl4iItKZbhtf3vvc98vLy\nWqoWERGRgNwyvDp27KjfeImIyLfOLb/zCg0Nbak6REREAnbL8Hr33Xdbqg4REZGA6WpDERGxHIWX\niIhYjsJLREQsJ+jhVVpaysiRI3E6nTe97N7j8TB37lySkpKYMGECZ8+ebfL82bNn+fGPf8zbb78d\n7FJFRMQighpeDQ0NLF68mDVr1rB9+3YKCws5ceJEkzEbN24kIiKC4uJiMjIyyM7ObvL8K6+8wpAh\nQ4JZpoiIWExQw+vw4cP07t2bnj17EhoaSnJyMiUlJU3GlJSU4Ha7AXA6nU1W7Xj//ffp1asXsbGx\nwSxTREQsJqjhVVFRQY8ePXyPo6OjqaysbDKmsrISh8MBgN1uJzw8nKqqKurq6li9ejWzZs0KZoki\nImJBAd1J+X9ljPnaY66vm5ibm8vUqVPp3LlzwPsCiIoK+/qFtlHqhZ964ade+KkX1hXU8HI4HE0u\nwKioqKB79+7NxpSXlxMdHY3X66W2tpaIiAgOHz5McXEx2dnZVFdX06FDBzp16sTTTz99y/c8f74m\nKJ/FaqKiwtSL/6de+KkXfuqFnxVDPKjhFR8fT1lZGWfOnCEqKorCwkJycnKajElMTKSgoID+/ftT\nVFREQkICAPn5+b4xK1eupEuXLrcNLhERaR+CGl52u52srCymT5+OMYbx48cTExNDbm4u8fHxJCYm\nkp6ezvz580lKSqJr167Nwk1ERORGNhPol0kWodMAjXRKxE+98FMv/NQLPyueNtQKGyIiYjkKLxER\nsRyFl4iIWI7CS0RELEfhJSIilqPwEhERy1F4iYiI5Si8RETEchReIiJiOQovERGxHIWXiIhYjsJL\nREQsR+ElIiKWo/ASERHLUXiJiIjlKLxERMRyFF4iImI5Ci8REbEchZeIiFiOwktERCxH4SUiIpYT\n9PAqLS1l5MiROJ1O8vLymj3v8XiYO3cuSUlJTJgwgbNnzwKwb98+0tLSGDt2LOPGjePAgQPBLlVE\nRCwiqOHV0NDA4sWLWbNmDdu3b6ewsJATJ040GbNx40YiIiIoLi4mIyOD7OxsALp168aqVavYunUr\nr7zyCs8//3wwSxUREQsJangdPnyY3r1707NnT0JDQ0lOTqakpKTJmJKSEtxuNwBOp5P9+/cD0KdP\nH6KiogB48MEH8Xg8XLt2LZjlioiIRQQ1vCoqKujRo4fvcXR0NJWVlU3GVFZW4nA4ALDb7YSHh1NV\nVdVkTFFREXFxcYSGhgazXBERsYiQYO7cGPO1xxhjsNlsvseffvopOTk5rF27NqD3jIoK+3pFtmHq\nhZ964ade+KkX1hXU8HI4HL4LMKDxSKx79+7NxpSXlxMdHY3X66W2tpaIiAgAysvLmTVrFq+++ir3\n339/QO95/nzNnfsAFhYVFaZe/D/1wk+98FMv/KwY4kE9bRgfH09ZWRlnzpzB4/FQWFjI8OHDm4xJ\nTEykoKAAaDw9mJCQAEB1dTU///nP+dWvfsWAAQOCWaaIiFhMUMPLbreTlZXF9OnTGTNmDMnJycTE\nxJCbm8sHH3wAQHp6OpcvXyYpKYl33nmHefPmAZCfn09ZWRm//e1vcblcuN1uLl26FMxyRUTEImwm\nkC+mLESnARrplIifeuGnXvipF346bSgiItICFF4iImI5Ci8REbEchZeIiFiOwktERCxH4SUiIpaj\n8BIREctReImIiOUovERExHIUXiIiYjkKLxERsRyFl4iIWI7CS0RELEfhJSIilqPwEhERy1F4iYiI\n5Si8RETEchReIiJiOQovERGxHIWXiIhYTtDDq7S0lJEjR+J0OsnLy2v2vMfjYe7cuSQlJTFhwgTO\nnj3re27VqlUkJSUxatQo/vKXvwS7VBERsYighldDQwOLFy9mzZo1bN++ncLCQk6cONFkzMaNG4mI\niKC4uJiMjAyys7MB+Oyzz9i5cyc7duzgrbfe4uWXX8YYE8xyRUTEIoIaXocPH6Z379707NmT0NBQ\nkpOTKSkpaTKmpKQEt9sNgNPp5MCBAwDs3r2b0aNHExISwv3330/v3r05fPhwMMsVERGLCGp4VVRU\n0KNHD9/j6OhoKisrm4yprKzE4XAAYLfbCQsLo6qq6qavraioCGa5IiJiEUENr0BO891sjM1m+8rt\nIiIiIcHcucPhaHIBRkVFBd27d282pry8nOjoaLxeLzU1NUREROBwODh37pxvXHl5ebPX3kxUVNid\n+wAWp174qRd+6oWfemFdQT3yio+Pp6ysjDNnzuDxeCgsLGT48OFNxiQmJlJQUABAUVERCQkJAAwb\nNowdO3bg8Xg4deoUZWVl9OvXL5jlioiIRQT1yMtut5OVlcX06dMxxjB+/HhiYmLIzc0lPj6exMRE\n0tPTmT9/PklJSXTt2pWcnBwAYmNjGTVqFMnJyYSEhPDSSy/ptKGIiABgM7r+XERELEYrbIiIiOUo\nvERExHIUXiIiYjltJrxut4ZiW1ZeXs6UKVMYPXo0KSkprFu3DoB///vfTJ8+HafTyYwZM6ipqWnl\nSltOQ0MDbrebZ555BoDTp0/z1FNP4XQ6yczMpL6+vpUrbBk1NTXMmTPHd/HToUOH2u28+N3vfseY\nMWNISUlh3rx5eDyedjMvFi5cyGOPPUZKSopv263mwZIlS0hKSiI1NZWjR4+2Rsm31SbCK5A1FNsy\nu93OggUL2LFjB3/4wx/Iz8/nxIkT5OXlMWjQIHbt2sXAgQNZtWpVa5faYtatW0dMTIzv8Wuvvca0\nadPYtWsXYWFhbNy4sRWrazlLly5lyJAh7Ny5ky1btvDAAw+0y3lRUVHB+vXr2bx5M9u2bcPr9VJY\nWNhu5kVaWhpr1qxpsu2r5sGePXsoKyujuLiYRYsW8dJLL7VGybfVJsIrkDUU27KoqCj69u0LQJcu\nXYiJiaGioqLJupFut5v333+/NctsMeXl5ezZs4f09HTftgMHDuB0OoHGXvz5z39urfJaTG1tLX/7\n298YN24cACEhIYSFhbXbedHQ0EBdXR319fX85z//oXv37hw8eLBdzIuHH36Y8PDwJttunAfX/80s\nKSnB5XIB0L9/f2pqarhw4ULLFhyANhFegayh2F6cPn2aY8eO0b9/fy5evEhkZCTQGHCXL19u5epa\nxrJly3j++ed9vwu8fPkyERERdOjQON0dDke7mB+nT5/mnnvuYcGCBbjdbrKysqirq2uX8yI6Oppp\n06YxdOhQnnjiCcLCwoiLiyM8PLzdzYvrLl261GQeXLp0CWi63ix8e9eVbRPhpZ+qNbpy5Qpz5sxh\n4cKFdOnSpV3+qPvDDz8kMjKSvn37+uaFMabZHGkPvamvr+fIkSNMmjSJgoICOnfuTF5eXrv47Deq\nrq6mpKSEDz74gL1791JXV0dpaWmzce2xNzeyyrqyQV1ho6UEsoZiW1dfX8+cOXNITU1lxIgRANx7\n771cuHCByMhIzp8/T7du3Vq5yuD75JNP2L17N3v27OHq1atcuXKFZcuWUVNTQ0NDAx06dAh4nUyr\nczgcOBwO4uPjAUhKSuKtt95ql/Ni37599OrVi65duwIwYsQI/v73v1NdXd3u5sV1XzUPoqOjKS8v\n9437tvalTRx5BbKGYlu3cOFCYmNjycjI8G0bNmwYmzdvBqCgoKBd9CQzM5MPP/yQkpIScnJyGDhw\nIK+99hoDBw6kqKgIaD+9iIyMpEePHnz++edA4/d+sbGx7XJe3HfffRw6dIirV69ijOHAgQM8+OCD\n7Wpe3HhE9VXzYPjw4bz33nsA/OMf/yA8PNx3evHbpM0sD1VaWsrSpUt9ayjOnDmztUtqMR9//DGT\nJ0/mBz/4ATabDZvNxty5c+nXrx/PPfcc586d47777mPFihXNvrRtyz766CPWrl3Lm2++yalTp8jM\nzKS6upq+ffuSnZ1NaGhoa5cYdMeOHePFF1+kvr6eXr16sXz5crxeb7ucFytXrqSwsJCQkBDi4uJY\nsmQJ5eXl7WJezJs3j4MHD1JVVUVkZCSzZ89mxIgR/PKXv7zpPFi0aBF79+6lc+fOLF++nB/96Eet\n/AmaazPhJSIi7UebOG0oIiLti8JLREQsR+ElIiKWo/ASERHLUXiJiIjlKLxERMRyFF5iScOGDeOz\nzz5rkfdauXJlk1tlLFiwgPz8/G+83wULFpCSkkJmZuY33tetHDt2jJ07dwb1PURamsJL5DZWrlzJ\ntWvX7ug+L1y4QHFxMdu2bSMnJ+eO7vtGR44c+Z/Dq6Gh4Q5XI3JnKLykTfn888/52c9+Rnp6Oi6X\ny7f8DUCfPn1YtWoV48eP58knn6S4uNj33K5duxg1ahRpaWmsWrWKPn36UFdXx6JFi7DZbEycOBG3\n201tbS0Ax48fJyMjA6fTyQsvvPCV9bz33nukpKSQmprK7NmzuXTpEleuXCEjI4OrV6/idrt55513\nmrxmy5YtzJo1y/fY6/UyePBg3/qdq1ev5qmnniItLY1nn32WixcvAnDt2jV+85vfkJKSgsvlYvbs\n2VRVVfHGG29w4MAB3G43S5cuBRpXpHG73aSmpjJt2jROnToFNK5K4nK5WLJkCRMnTmTv3r3f5K9D\nJHiMiAUlJiaaTz/9tMm2+vp643a7zcmTJ40xxtTW1hqn0+l7/MMf/tDk5+cbY4z5+OOPzeDBg40x\nxly4cME8+uijpqyszBhjzNtvv2369OljvvzyS9/r6urqfO/zwgsvmEmTJhmPx2M8Ho9JTk42+/bt\na1bj8ePHzeOPP24uXLhgjDHm9ddfN88995wxxpjTp0+bhISEm362uro6k5CQYC5fvmyMMWb37t0m\nIyPDGGPMli1bTFZWlm/shg0bzLx584wxxrzxxhtm9uzZpr6+3hhjfK/fvHmzmTNnju81Fy9eNAkJ\nCebEiRPGGGP+9Kc/mfT0dGOMMQcPHjRxcXHm0KFDN61N5NtCR17SZnzxxRecPHmSzMxMXC4XTz/9\nNNeuXWtyV+3Ro0cDMGDAAM6fP4/H4+HQoUM89NBD9OrVC4Dx48c327e5YRW1ESNGEBoaSmhoKHFx\ncZSVlTV7zcGDBxk6dCj33nsvABMnTmTfvn23/Rx33XUXw4cPZ/v27UDjoqnXbyi5e/du9u/fj8vl\nwuVysWHDBs6dOwc03g5mypQp2O12AN8K6jc6dOgQffv25YEHHgBg3LhxHD16lC+//BKA3r17069f\nv9vWKdKa2sQtUUSgMWC6detGQUHBTZ+32Wx06tQJwHcDQq/X2yyYbnx8Mx07dvT92W63N7mg47/3\nc+N9kK6/7+24XC6WL1/OmDFj+Oijj8jOzvbt89lnnyUtLe2m7xeIm9X134+/853vBLQfkdakIy9p\nM77//e9z1113sWXLFt+2kydPcuXKFaD5P+7XHw8YMIB//vOfvu99/vt7MoC7776bmpqar13PoEGD\n2LNnj+87qT/+8Y889thjzd7/Zh5++GFqa2vJycnhySef9IXusGHD2LBhA9XV1QB4PB6OHTsGQGJi\nIuvWrfNdXHL9Dsl3332377u665/36NGjvlulbN68mbi4OIWWWIqOvMSSbDYbU6dOJSQkxHcksW3b\nNt58802WLl3K2rVr8Xq9REZG8vrrr/tec+M+oPGmfC+//DIzZ87knnvuYejQoYSEhNC5c2cApk2b\nxpQpU+jcuTPr168PuMbY2FgyMzOZOnUqHTp0oFevXixatKjZ+38Vl8tFbm4uGzZs8G1LTU2lqqqK\nyZMnY7PZaGhoYNKkSfTp04eZM2eSk5ODy+WiY8eOfPe732XFihUMGjSINWvW4HK5eOSRR3jxxRd5\n9dVXmTdvHl6vl27duvmO7ESsQrdEEQGuXLlCly5dgMYjkU2bNt2R33KJSHDoyEsEWL9+PUVFRXi9\nXrp27crixYtbuyQRuQUdeYmIiOXogg0REbEchZeIiFiOwktERCxH4SUiIpaj8BIREctReImIiOX8\nH4gzFtcS9o9MAAAAAElFTkSuQmCC\n",
+            "text/plain": [
+              "\u003cmatplotlib.figure.Figure at 0x7f47b20dd690\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "output_type": "display_data"
+        }
+      ],
+      "source": [
+        "plt.plot(graph_means)\n",
+        "plt.ylabel('Time (seconds)')\n",
+        "plt.xlabel('Length of vector')\n",
+        "_ = plt.title('Time to sum the elements of 1000 vectors (vectorized TF operation)')\n",
+        "_ = plt.ylim(ymin=0)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "4KZg2WXjbhg5"
+      },
+      "source": [
+        "## AutoGraph"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "UQJBQWbCbinm"
+      },
+      "outputs": [],
+      "source": [
+        "# Sum written using for loop and converted with AutoGraph\n",
+        "def sum_all(elements):\n",
+        "  sum_ = 0.0\n",
+        "  length = len(elements)\n",
+        "  for i in tf.range(length): \n",
+        "    sum_ += elements[i][0]\n",
+        "  return sum_\n",
+        "\n",
+        "def run_trial(num):\n",
+        "  elements = get_elements(num)\n",
+        "  return sum_all(elements)\n",
+        "    \n",
+        "ag_means = []\n",
+        "ag_run_trial = ag.to_graph(run_trial)\n",
+        "\n",
+        "for num in range(max_elements):\n",
+        "  with tf.Graph().as_default():\n",
+        "    durations = []\n",
+        "    foo = ag_run_trial(num)\n",
+        "    with tf.Session() as sess:\n",
+        "      for _ in range(burn_ins):\n",
+        "        for _ in range(batches):\n",
+        "          sess.run(foo)\n",
+        "        \n",
+        "      for _ in range(trials):\n",
+        "        start = time.time()\n",
+        "        for _ in range(batches):\n",
+        "          sess.run(foo)\n",
+        "      \n",
+        "        duration = time.time() - start\n",
+        "        durations.append(duration)\n",
+        "    ag_means.append(np.mean(durations))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 301
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 310,
+          "status": "ok",
+          "timestamp": 1532448438694,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 240
+        },
+        "id": "DLDOmrRW99v5",
+        "outputId": "ae0e0573-39db-4004-a064-efc618dbf867"
+      },
+      "outputs": [
+        {
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYwAAAEcCAYAAADUX4MJAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3XdYVFf++PH3DE1AinTEjgULioggNiTYjcZCoiZqjEmM\n6RuzcVc32exPE7O72dTNmmhi1u+abLJqLFFsib1E7GLDjkgbQUSKyDAz5/eHySARcYwMQ/m8nsfn\nce4999zPPXOHzz23nKtRSimEEEKIu9DaOgAhhBC1gyQMIYQQFpGEIYQQwiKSMIQQQlhEEoYQQgiL\nSMIQQghhkTqZMObPn88bb7xh6zBqlQceeICffvrJ6uuZOXMmH330kdXXU1NcuHCBUaNG0a1bN776\n6itbh1OnnT17ljFjxtg6jGpV2e9Jr9czZMgQcnNzq2x9tTJhdO3alfDwcMLDw2nfvj1dunQxT1uz\nZg3PPPMMc+bMsXoc6enphISEYDKZrL6uqlTf/mjfjTW/xy+++IKoqCgOHDjAhAkTbpu/bt06xo0b\nR1hYGJMmTbpt/smTJxk9ejRhYWGMGTOG5OTkcvPfffddoqKi6NGjB+++++49LWttEydOZNmyZdW2\nvo8//pinnnqqwjgiIyMpLS29p/pCQkK4dOnSPS3z1VdfMWLECMLCwujduzeTJk1i7dq191RHVXF0\ndCQ+Pp7PP/+8yuqslQnj0KFDHDx4kIMHD9K4cWPmz59vnvbggw9WWxxKKTQaDfLsY+1mze8xIyOD\n1q1b33G+p6cnkydPZurUqbfNKy0t5fnnn2fkyJHs27ePkSNH8txzz2EwGAD49ttv2bx5M6tXr+b7\n779n69at/O9//7No2drgXr6P7OxsEhMTiYuLKzc9PT2dAwcOoNFo2Lx58z2tX6PR3FP5OXPmsHjx\nYmbOnMnevXvZsWMHv/vd79ixY8cdl7H2344HH3yQFStW3HOyvJNamTBupZS6rdE/+eQTXnvtNaDs\n6HH58uX069ePqKgovv32W44ePcqIESOIjIy8rTeybNkyhg4dSlRUFE899RQZGRkVrnvixIkARERE\nEB4ezpEjR1BKMW/ePB544AF69erFH//4RwoLCytc/urVq0ybNo3u3bsTFRVV7gj010c3t/YK9u7d\nS0xMDF988QU9e/akT58+/Pjjj2zbto1BgwYRFRXF/PnzK1znkiVLWL16NV988QXh4eE8++yz5nkn\nT55kxIgRdO/enenTp6PX683ztmzZwsiRI+nevTvjx4/n1KlTFdYPcO7cOaZMmUJUVBRDhgxh3bp1\ndyxbWb0PPPAACxcuZMSIEXTt2pXXX3+dK1eu8PTTTxMeHs6UKVMoKCgwlz98+DDjxo2je/fujBw5\nkr1795rnTZw4kY8++ojx48cTHh7Ok08+SV5ennkelP8eU1NTmThxIhEREURHRzN9+vQ7bsOmTZt4\n8MEHiYyMZNKkSZw/fx6Axx9/nMTERGbPnk14eDgXL168bdno6GgGDx6Mr6/vbfP27t2L0Whk0qRJ\nODg4MHHiRJRS7NmzB4CVK1cyZcoU/Pz88PPz44knnmDFihUAJCYmVrrsrdauXXvbqZxFixbx3HPP\nATdPbfztb38jNjaW3r1785e//KXcvvHjjz8ycuRIunXrxsCBA9m5cycffPABBw4cYM6cOYSHh/PW\nW28BcPDgQeLj4+nevTsPP/wwhw4dKvcdffDBB4wfP56wsDDS0tJYvnw5/fv3Jzw8nP79+7NmzZoK\nv4Ndu3bRsWNHHB0dy01fuXIlYWFhjB492tw2t67v1h7QihUrePTRRwGYMGECSilGjBhBeHi4eR9e\nsmQJAwcOJCoqiueee47Lly8DN089fvPNN3zwwQdER0fj6OiIRqMhPDycd955567bOHToUMLDwxkw\nYIA56f+yD8TExDB//nx69OhBXFwcq1evLrcd165d45lnniE8PJyxY8eW+7vh7++Ph4cHR44cqbDd\n7pmq5WJjY9Xu3bvLTfvnP/+pXnvtNaWUUmlpaapdu3bqzTffVCUlJWrXrl0qNDRUPf/88yo3N1dl\nZWWp6OhotW/fPqWUUj/88IMaOHCgOn/+vDIajerTTz9VY8eOrXDdaWlpKiQkRJlMJvO0pUuXqoED\nB6q0tDR1/fp19cILL5hj+bX33ntPvfnmm8poNCqDwaD2799vnhcSEqJSU1PNn//4xz+qDz/8UCml\nVGJiourQoYOaN2+eMhgMasmSJapHjx7q1VdfVdevX1dnzpxRoaGh6tKlSxWu99a6bm3Hhx9+WGVn\nZ6tr166pIUOGqG+//VYppdSxY8dUdHS0SkpKUiaTSa1YsULFxsYqvV5/W93Xr19XMTExasWKFcpk\nMqkTJ06oqKgodfbs2dvWfbd6Y2Nj1dixY9WVK1eUTqdT0dHRatSoUerkyZNKr9erSZMmqU8++UQp\npVRWVpaKjIxU27dvV0optXv3bhUZGalyc3OVUkpNmDBBDRgwQF28eFGVlJSoCRMmqPfee++O3+P0\n6dPVZ599ppRSqqSkRB04cKDCtjx//rwKCwtTu3fvVgaDQX3++edqwIABqrS01LzepUuXVrjsrZYs\nWaImTpxYbtq///1v9fTTT5eb9swzz6h///vfSimlunXrpo4cOWKed/ToURUeHm7RsrcqLi5W4eHh\n6uLFi+ZpY8aMUWvXrlVKKfXWW2+pZ599VuXn56uioiI1bdo09f777yullDpy5Ijq1q2b+Teo0+nU\n+fPnK9z2vLw81b17d/X9998ro9Go1qxZo7p3767y8vLM5WNjY9XZs2eV0WhUBQUFKjw8XKWkpCil\nlMrOzjbvR7/2t7/9Tc2ePfu26QMGDFDffPONOnbsmOrYsaO6cuWKed6v41u+fLl69NFHzZ/btWtX\n7je4e/duFRUVZd7/5syZox577DGllFLffPONeuCBByqM7Va/3sbS0lK1detW82913759qkuXLurE\niRNKqbLf+l//+lel1+vV3r17VVhYmLpw4YJS6ubvKTIyUh09elQZjUb16quvqunTp5db57Rp09Ti\nxYvvGpslan0PwxIajYbnn38eR0dHevbsibOzM8OGDaNRo0b4+/sTERHBiRMnAPjf//7H1KlTadmy\nJVqtlqlTp5KcnExmZuYd61e39HDWrFnD5MmTCQoKwtnZmenTp7N27doKz4/b29uTnZ1NWloadnZ2\ndOvWrcI6K+Lg4MC0adOws7Nj6NChXL16lccffxxnZ2dat25N69atK+0FVGTSpEn4+Pjg7u5ObGws\nJ0+eBGDp0qWMGzeO0NBQNBoNI0eOxNHRscKjli1bttCkSRNGjhyJRqOhffv2DBw4kPXr199W1pJ6\nJ0yYgJeXF35+fkRERNClSxdCQkJwcHBgwIAB5hi///57+vXrR58+fYCbR+6dOnVi27Zt5rpGjx5N\ns2bNcHR0ZMiQIeZlf3Frm9vb25Oeno5Op8PR0ZHw8PAK22zdunX069eP6Oho7OzsePLJJ7lx40a5\nI+ff6vr167i5uZWb1rBhQ3OP9dfz3dzcuH79ukXL3qpBgwbExcWZj95TUlK4cOGC+fTOsmXLmDlz\nJm5ubri4uDB16lRz2WXLlhEfH090dDQAfn5+tGzZssLt2bp1Ky1atGD48OFotVqGDRtGq1at2LJl\ni7nMqFGjCA4ORqvVYmdnh52dHadPn6akpAQfHx+Cg4MrrLugoABXV9dy0/bv309GRgZDhgyhY8eO\nNGvW7Laj83uxZs0a4uPjzfvf9OnTOXz4MBkZGVy9evW2XmJMTAzdu3enc+fO5f5+3LqN9vb2xMTE\n0KRJE+BmL7dXr17s37/fXF6j0fC73/0OBwcHunfvTkxMTLle+8CBA+nUqRNarZbhw4fftl+7urqS\nn5//m7f7VvZVUkst4O3tbf5/gwYN8PHxMX92cnIy/9AyMjJ4++23+dvf/gaUnd/W6XQEBgbedT2X\nL1+mcePG5s9BQUEYDAZycnLw8/MrV/app57in//8J1OmTEGj0fDwww9XeC67Ip6enuZzrA0aNKhw\nG3/ZJkvduryzszPZ2dnAzTZZtWqV+S4fpRQGg8HcHb9VRkYGhw8fJjIy0lzWaDQycuTICsverd5b\nY3Jycrrt863f27p168x/fH6p65c/ZEC579zZ2bnS9pkxYwYffvgh8fHx5usMFd2B8+vvW6PREBgY\niE6nu2PdlnJxcbntD3xhYSENGzascH5hYSEuLi4WLftrw4YN4+9//zvPPfcca9asoX///jg6OpKb\nm0txcXG5bTeZTObkmpWVRUxMjEXb8+u2AmjcuHG5tgoICDD/39nZmQ8++ICFCxcya9YsunXrxowZ\nM2jVqtVtdbu7u1NUVFRu2qpVq+jduzceHh7mbVy5ciWPP/64RfFWFH/Hjh3Nn11cXPD09ESn0+Hp\n6Xnb72Hbtm0YjUY6depU7mDk1m38pdy8efNISUnBZDJx48YN2rVrV27bnJyczJ8bN25cbl1326+L\niopwd3f/Tdv8a/UmYVgqICCAZ5991qKL5xVdFPPz8yt3zSM9PR17e/tyX+ovXFxc+MMf/sAf/vAH\nzp07x8SJE+ncuTM9evTA2dmZ4uJic9ns7OzbdrTqEhAQwLRp03jmmWfuWjYwMJCoqCgWLlxYpfVa\nst6RI0cye/bse162ou/R29vbfG3rwIEDPPHEE0RGRtK0adNy5fz8/Dhz5ky5aZmZmVXyXbVp04ZF\nixaVm3b69GnzNZfWrVuTnJxMaGgocPMaVJs2bSpdtqI7tQB69+7NzJkzSU5OJiEhgVmzZgHQqFEj\nnJ2dWbNmzW0HPHDzO7zTnUS/blc/Pz82btxYblpGRgZ9+/a94zK9evWiV69e6PV6PvjgA9544w2+\n/vrr29bVrl07Vq1aZf5cUlLCunXrMJlM9O7dG7h5I0B+fj6nTp2iXbt2uLi4cOPGDfMyvxwg3cmv\nf9vXr18nLy8Pf39/PD09eeuttzh+/Hi5pAK3ny24dRv1ej0vv/wy7777LnFxcWi1Wp5//vlyy+Tn\n53Pjxg3zgWFmZiZt27atNNZbnT9/nieffNLi8pWpF6ek7nZ651bjx49n/vz5nD17FrjZ1a3odAqA\nl5cXWq2W1NRU87Rhw4axaNEi0tLSKCoq4oMPPmDYsGFotbc39datW83Luri4mLvgcPOi95o1azCZ\nTGzfvp19+/ZZvA134+Pjc0+3Cz7yyCN8++23JCUlATd/KNu2bavwCL1fv35cuHCBVatWYTAYKC0t\n5ejRo+YLwb+13rsZMWIEmzdvZufOnZhMJkpKSti7d69FR/oVfY/r1683L+vu7o5Wq63wOxwyZAhb\nt25lz549GAwGFi5ciJOTE2FhYRbFbTKZ0Ov1GAyGcv8HiIyMRKvVsnjxYvR6vbknFhUVBcDIkSNZ\ntGgROp0OnU7HokWLGD16dKXL9ujRo8I47OzsGDRoEH//+9/Jz8+nV69eAOae79y5c8338+t0Onbu\n3AlAfHw8y5cvZ8+ePSil0Ol05u/61/tZTEwMFy9eJCEhAaPRyNq1azl//jyxsbEVxnTlyhU2b95M\ncXEx9vb25t9IRXr16sXx48fNF+N/+OEH7OzsWLduHatWrWLVqlWsXbuWbt26sXLlSuDmb2zjxo3c\nuHGDixcv8t1335Wr89fxP/jggyxfvpzk5GT0ej3vv/8+Xbp0oXHjxrRs2ZKxY8cyffp0du/eTUlJ\nCSaTiYMHD1Z6t1VpaSmlpaU0atQIrVbLtm3b2LVrV7kySik+/vhjSktL2b9/P1u3bmXIkCF3rPNW\nOp2Oa9eu0aVLF4vK302tTxiW3Pr26zKVfe7fvz9PP/00r7zyChEREYwYMeKOt8U1aNCAadOmMX78\neCIjI0lKSiI+Pp6HHnqICRMmMGDAAJydnXn99dcrXD4lJYXJkyfTtWtXxo8fz2OPPUb37t0B+NOf\n/sTmzZvp3r07CQkJ9O/f/7628Vbx8fGcPXuWyMhIXnjhhbuW79SpE3PmzGH27NlERkYyaNCg2+44\n+YWrqytffvkla9eupU+fPvTp04f33nuv3F01ltZ7L9sUEBDAvHnzmD9/PtHR0cTGxvLll1+aDxYq\nW7ai7/Ho0aM8/PDDhIeH8/zzz/OnP/2JoKCg25Zt2bIl7777LnPmzCE6OpqtW7fy2WefYW9vf9f1\nws3TJp07d2b27NkcOHCALl26mB86dXBwYN68eaxYsYLIyEiWL1/OvHnzzHWPGzeO2NhYRowYwYgR\nI4iNjeWRRx6xaNmKDBs2jJ9++okhQ4aUS46///3vad68OY888ggRERFMmTKFlJQUADp37szcuXOZ\nO3cu3bp1Y9KkSebz9ZMmTWL9+vVERUXx9ttv4+npyWeffcbChQvp0aMHCxcuZP78+eZTRr9uK5PJ\nxL///W/69u1Ljx492LdvH2+++WaFsXt7e9OjRw82bdoE3Lw7asyYMfj7++Pt7W3+99hjj7F69WpM\nJhOTJ0/GwcGBXr16MXPmTIYPH16uzhdffJEZM2YQGRnJ+vXriY6O5uWXX+bFF1+kT58+pKWl8f77\n75vL//nPf2bixIm88847REVFERMTw8cff8yHH35oPhX36210dXXlT3/6Ey+//DKRkZGsXbv2tluD\nfX198fDwoE+fPsyYMYPZs2fTokWLO36Pt1q9ejWjRo3CwcHBovJ3o1H3cvh9j2bNmsXWrVvx9va+\n48WmxMRE3nnnHQwGA40aNWLx4sXWCkcIUYedO3eOP/7xjyxdutTWoVSZvXv3MmPGDLZu3XrPy+r1\nekaOHMlXX32Fl5dXlcRj1YSxf/9+XF1dmTFjRoUJo6CggHHjxvHll1/i7+9Pbm5ulW2YEELUdveT\nMKzBqqekIiIiKr06v3r1agYOHIi/vz+AJAshhKjBbHoNIyUlhWvXrjFx4kTGjBljvhglhBDi5s0L\nNaV3ATa+rdZoNHLixAn+7//+j+vXrzNu3Di6du1K8+bNbRmWEEKICtg0Yfj7+9OoUSOcnJxwcnIi\nIiKC5OTkuyaMXx6mE0IIUX2snjAqu6YeFxfHW2+9hdFoRK/Xk5SUxBNPPHHXOjUaDdnZBXctVx/4\n+rpJW/xM2qKMtEUZaYsyvr5udy9UCasmjFdffZXExETy8vLo168fL774IqWlpWg0GsaOHUtwcDC9\ne/dmxIgRaLVaHnnkkUqHghZCCGE7Vr2t1prkiOEmOXoqI21RRtqijLRFmfvtYdT6J72FEEJUD0kY\nQgghLCIJQwghhEUkYQghhLCIJAwhhBAWkYQhhBB1VFXfBCsJQwgh6qD07EJe+HAHe05kVVmdkjCE\nEKIOWrnjAsUlBlwbVM3Lk0AShhBC1GqLN5zi201nyp1+StUVcOB0Nq0au9OpZdW9NsKmgw8KIYT4\n7c6mX2PLoXQA/Bs5ExveBIBVOy8AMLJ3yyodqFUShhBC1FIb9qYC4Oig5ZtNZ2gR6I5Wo+HQmRyC\ng9zpWIW9C5BTUkIIUStdvnqdg6eyaR7gxgujQjEaFZ+uPMaSLWcBGNmnVZW/BkJ6GEIIUcMopZi3\n8hhXC0ro0MKLTi29aNXYHXu7smP8jfsuoYDBkc3o1MqbB3u2YPXuFHKu3aBNEw86NG9U5XFJwhBC\niBrmXEY+B05lA3A+I581u1Nwc3HgyWHt6RzsQ2FxKTuTMvF2dyIixBeAh3q35Gz6NU5evGqV3gVI\nwhBCiBpn19FMAJ4b2Qk7Ow3HLuSy40gmHy5NYlh0c+y0GvQGEwO6N8NOe7PXodVqeGlMZzKuFNEy\n0N0qcUnCEEIIG7mYVcD5jGv06xpk7hHoS43sPamjkZsT4W190Wo1dG3jS9/Ojfl05TESfroIgLOT\nPX06B5arz8nRzmrJAuSitxBC2IS+1Mi/Vhxl8cbTJJ7UmacfOpNDcYmRnp0C0GrLTis1D3Djz5O7\nE9725imouG5BODtV7zG/9DCEEMIGNuxNJefaDQCWbD5Ll2AfnJ3szaejenYKuG0Zlwb2PD+qE5cu\nF9LEt2G1xgvSwxBCiGqXm3+DhD0XcXdxYGD3puQV6lm9K4WrBSUcT8klOMidQG/XCpfVaDQ083cr\n1/uoLlZNGLNmzaJnz54MHz680nJJSUl06NCBjRs3WjMcIYSoEZZtO4e+1MSYmGBG922Fj0cDfth/\nieXbzqEU9AoNvHslNmDVhDF69GgWLlxYaRmTycR7771Hnz59rBmKEELUCGfTrrHnuI7mAW706hyI\no4Md4/u3wWhS7DqWhb2dlsgQP1uHWSGrJoyIiAjc3Su/Yr948WIGDRqEl1fVPsIuhBA1jUkpvtl0\nGoBH+7dB+/OdUWGtfegc7A1AeFsfXKpwhNmqZNNrGDqdjh9//JHx48fbMgwhhKgWe45ncSGzgMj2\nfrRp4mmertFomDCgLV2CvRkW3cJ2Ad6FTe+Smjt3Lq+99pr5/uOqfjuUEELUFCWlRr7bdh4Hey0P\n92t923wfT2defriLDSKznE0TxrFjx3jllVdQSnH16lW2b9+Ovb09cXFxd13W19etGiKsHaQtykhb\nlJG2KFMT2uLbH05xtaCEh+PaENLa19bh/CZWTxiV9Ro2bdpk/v/MmTOJjY21KFkAZGcX3HdsdYGv\nr5u0xc+kLcpIW5SpCW1xtaCEpZtO4+7iQL/OgTaL534Tp1UTxquvvkpiYiJ5eXn069ePF198kdLS\nUjQaDWPHjrXmqoUQosZYseM8+lIT4+PaVPvT2VXJqpG/9957Fpd95513rBiJEEJUv5JSI9sOpbMr\nKZMmvq706dzY1iHdl9qb6oQQwoYMRhMnUq4S0swTRwe7cvOKSwxsOZTOhr2pFFwvxdFBy4SB7Wzy\ndHZVkoQhhBC/wfJt51m/N5UALxeeHNae4CAPlFLsP5XNf388zbVCPc5OdjzYszkDIpri5uJo65Dv\nmyQMIYS4R5lXivhh/yWcnezR5V5n7lcH6N+tKZm5RRw7n4u9nZbhPVswKLJpjX0I77eQhCGEEPfo\n201nMZoUU4aG4ObiyJcJJ/lh/yUAOrb0YsLAtvg3crFxlFVPEoYQQtyDI2dzOHr+Cu2bNyK8rS8a\njYb/NyWSH/ZfIsDLhW7tfK3yetSaQBKGEEJYyGA08e2mM2g1Gsb3b2NODE6OdjzYs4Vtg6sG8j4M\nIYSwgMFoYvm28+iuFhPbNcgmLzCyNelhCCFEJZRSHDqTw9Kt59DlXsfD1ZGH+rS0dVg2IQlDCCF+\n5WpBCeczrnE+I58TF69yMasArUZDv65BPNS7JQ2d686dT/dCEoYQQvys1GDi281n2HIw3TxNA3Rt\n40N8v+A7vja1vpCEIYSol86k5eHsaE9jX1e0Gg05ecXMW3mMlKwCGvu4Et3Rn1aB7rQIdK/V4z9V\nJWkFIUS9s+NIBv9elwxAQ2cH2jTx4PSlPIpuGOjVKYAJg9rh9KvhPoQkDCFEPXMq9Sr/2XAK1wb2\ndGntw6nUqxw6k4O9nZbJQ0Lo0zmwzj5Hcb8kYQgh6g3d1et8svwoAM+PCiWkeSOUUuRcu4GTgx3u\nrrV/vCdrkoQhhKgXim6U8tHSJIpuGJg8JISQ5o2Am+/T9vV0tnF0tYM8uCeEqPOMJhOfrTxGVu51\nBkc2o2+X2v1eCluRhCGEqPOWbjnH8ZSrdA72Jr5fsK3DqbUkYQgh6rQf915k475LBHq78MyIjrX+\nJUa2JAlDCFGrXS0oYfPBNIpulN4270xaHv9aloRrA3teiu8sz1PcJ6u23qxZs9i6dSve3t6sXr36\ntvmrV6/m888/R6PR4OLiwl/+8hfatWtnzZCEEHVIid7I+0sOk55dxIrt5xkW3YK4bkHorhazds9F\nEk/o0Gg0TBvZqU6+n6K6aZRSylqV79+/H1dXV2bMmFFhwjh8+DDBwcG4ubmxfft2PvnkE5YsWWJR\n3dnZBVUdbq3k6+smbfEzaYsy9aEtlFIsWH2CxBM62jdvxMWsAq6XGGjo7EBh8c3eRhNfV6aM6EQL\n3/o9pMcvfH3d7mt5q/YwIiIiSE9Pv+P8sLCwcv/X6XTWDEcIUYf8eCCNxBM6goPceeWRLpSUGkn4\n6SJbDqUTHOTOsOgWdAn2xs/Pvc4nz+pSY07oLV26lL59+9o6DCFELXD6Uh5LNp/F3cWB50aGYm+n\nxd5OyyOxrXkktrWtw6uzakTC2LNnD8uXL+e///2vxcvcb9eqLpG2KCNtUaautsW5tDzmrTyGAv44\nOZK2rXzuukxdbYvqZvOEkZyczJ///Ge++OILPDw8LF5Oupg31Ydz1ZaStihTV9vibPo1PlhyhBsl\nBh4fEkKAu9Ndt7OutsVvUaOvYcDNC1N3kpGRwUsvvcTf//53mjVrZu1QhBC1WPLFq3y0LIlSg4mn\nhncgumOArUOqd6yaMF599VUSExPJy8ujX79+vPjii5SWlqLRaBg7dizz5s3j2rVr/L//9/9QSmFv\nb8+yZcusGZIQopZJyy7kx/1p7D6WiVLw7MhOdGvna+uw6iWr3lZrTdLFvEm622WkLcrU9rYwmkwk\nnb3CjwfSOHnxKgA+Hg2YNLgdnVp631Ndtb0tqlKNPyUlhBCWulpQwq6jmWw9nE5ufgkA7Zs3on+3\nJnRp7SPDetiYJAwhhM2YTIp9yZc5kZLL6Ut56K4WA+DkYEds1yBiuwbRxK+hjaMUv5CEIYSwCX2p\nkc9Xn+DA6WwAGjjaEdrKm87B3vTsFCDjPtVA8o0IIapdwXU9//zuKGfTrxHSzJNHHmhNMz83OeVU\nw0nCEEJUq8t5xXyw5Ai63Ov06ODPE0Pb42AvA2fXBpIwhBDVJj2niH98e4hrhXqG9mjO6JhWaDXS\nq6gtJGEIIapFqq6Af3x7mMLiUsY90JqBkfKwbm0jCUMIYTUmk6KguJSLWQUs+P44xSUGHh/cjpiw\nIFuHJn4DSRhCiCp3IiWX/2w4RU7eDUw/Pxus1WhkSI9aThKGEKJKHTiVzfzvjwHQqrE7Hg0d8XR1\nIrydL+2bN7JxdOJ+SMIQQlRqx5EMDp3JoW9YYzoHe1d6kXrHkQwWrU/G0d6OF8eE0qGFVzVGKqxN\nEoYQ4o5+EQoNAAAgAElEQVRKDSaWbj1HYXEph8/mEOjtQv9uTWjo4oi+1Ii+1Ej+9VKuFtzgyrUb\nHE+5imsDe155JIxWjd1tHb6oYpIwhBB3dOhMNoXFpUS298PeTkviCR2LN56+Y3n/Rs68MDqUIF8Z\nzqMukoQhhLijHUmZAIzo1ZLGPq6M7tuKg6ez0Wg0ONprcXDQ4ubiiJebE54NnWQ4jzpOvl0hRIVy\n8oo5cSGX1kEeNPZxBcDLvQH9I5raODJhK/I8vhCiQjuPZqKAPl0CbR2KqCEkYQghbmMyKXYezaSB\nox3dQ/xsHY6oISRhCCFuczwll9z8EiLb+9PAUc5ci5skYQghbrP9SAYAfbs0tnEkoiaxasKYNWsW\nPXv2ZPjw4Xcs89ZbbzFw4EAeeughTp48ac1whBB3kXmliM9WHePgqWyCfF1pGXh/74AWdYtV+5qj\nR49m4sSJzJgxo8L527ZtIzU1lY0bN3LkyBHefPNNlixZYs2QhKi3SvRGPlt1DEcHO8Ja+xAa7I1r\nA3uuXLtBSlYBh87ksOdEFkpBc383Jg8JQSNDj4tbWDVhREREkJ6efsf5mzZtYuTIkQB06dKFgoIC\ncnJy8PHxsWZYQtRLq3encOTcFQD2JV9Gq9Hg7GRH0Q2DuUwTX1ce6t2K8LY+kizEbWx6Nevy5csE\nBJSNXOnv749Op5OEIUQVy7xSxIa9qXi7N+D50Z04fiGXw2dzKLheSocWXrQIcKNloDttm3nKC43E\nHdk0Yaifhz2+laVHNb6+cm71F9IWZaQtyvzSFkopPvouCaNJMW1MZ7p3CqR7aP16H4XsF1XDpgnD\n39+frKws8+esrCz8/Cy75zs7u8BaYdUqvr5u0hY/k7Yoc2tb7D2p48iZHDoHe9PKz7XetZHsF2Xu\nN3Fa/bbainoRv4iLi2PlypUAHD58GHd3dzkdJUQVKiwu5dtNZ7C30/Jo/zZyXULcF6v2MF599VUS\nExPJy8ujX79+vPjii5SWlqLRaBg7diwxMTFs27aNAQMG4OzszDvvvGPNcISos0r0Rq6XGCg1mjAY\nTBy9mMeWfakcu3AFg1ExolcL/Bq52DpMUctpVGVdgBpMupg3SXe7TH1rC6UUpy/lseVQOgdOZWM0\n3f5TbuLrSlQHfwZFNsPern4+p1vf9ovK3O8pKXnmX4ha6PSlPBZvOEV6ThEAgd4uNPN3w95Og4Od\nlqaBHoQ0cSfQ29XGkYq6RBKGELXMoTPZfLryOCaTIrK9H7Fdg2jb1LPc9Qk5qhbWIAlDiFpk19FM\n/r02GXt7DS+N6UynVt62DknUI5IwhKihDEYTmw+kkX3tBiaT4nqJgcQTOlwb2PO7h7sQHORh6xBF\nPSMJQ4ga6rtt59iw91K5aY3cnHjlkS40kXdmCxuQhCFEDXToTDYb9l7C38uFaSM6Ym+vxU6rwdvd\nCQd7O1uHJ+qpuyaMS5cusWzZMhITE8nKysLJyYmQkBAGDRrEwIEDsbeXnCNEVcrJK2bhmpM42Gt5\nbmQnmvpJb0LUDJX+tf/zn//M8ePHGTx4ML///e/x8fGhpKSEc+fOsXPnThYsWMBf/vIXwsLCqite\nIeo0g9HEp6uOc73EwOQhIZIsRI1SacKIi4tj9uzZt01v164dQ4cOJS8vj0uXLlWwpBDiXhmMJhYm\nnORCZj7RHQPo0znQ1iEJUU6lCSMmJqbShT09PfH09KzSgISoj0oNRj5deZzDZ3MIDnJn4qC2Mu6T\nqHEsGivgr3/9KwUFBRgMBh599FHCwsJYtWqVtWMTol64oTfw4dIkDp/NoUOLRvx+bFcaOMq1QVHz\nWJQwdu/ejZubGzt37sTf358NGzbw5ZdfWjs2Ieq8VF0Bf/3qICcvXqVrGx9eju+Mk6PcBSVqpns6\njNm3bx8DBgzA399fustC3IdSg5Hvd6Wwbk8qJqWICWvMhIFtsdPWzwECRe1gUcLw9vbm9ddfZ9eu\nXUydOhWDwYDRaLR2bELUOfpSI3tO6Fi35yK6q8V4uzfg8cHtZIgPUStYlDDee+89vv/+e+Lj4/Hw\n8CAtLY0nnnjC2rEJUWcUFpeycV8qWw9lUFhcilajoX9EE0b3bSXXK0StYdGe6uXlxeTJk82fmzRp\nQpMmTawVkxA12rXCErRaDW4ujhaV1129zgdLjnD5ajGuDewZFt2c2K5BeLk3sHKkQlStShPGc889\nx7Rp0+jcufNt8woLC/nuu+9o0KABY8eOtVqAQtQkJXojbyzcyw29gZ6dAhkU2bTSd06cS7/GR8uS\nKCwuZUiPZozo1RInB7moLWqnShPGSy+9xHvvvUdKSgqdO3fG29ubkpISzp8/T3p6OuPGjWP8+PHV\nFasQNrc3WUdhcSmODlq2H8lgx5EMQoO96djSi3ZNPWni2xC9wYgut5jzGdf43+azlBpNTBrcjn5h\nQbYOX4j7UmnCCAkJ4fPPPyczM5O9e/ei0+lwcnJi8ODBdOvWDUdHy7rkQtQVO45kogFmPxlFalYB\n6xIvknTuCknnrgDgaK9FbzCZyzs6aHlpTGe6tPaxUcRCVB2LrmEEBgby0EMP/aYVbN++nblz56KU\nYsyYMUydOrXc/MzMTP7whz9QUFCAyWRi+vTpd33CXAhbSM8p4mz6NTq29MLP0xk/T2e6tfMl+9oN\nzlzK49SlPFIyC3B3dcDfy4WARi6EBnsT4OVi69CFqBIWJYwrV67wzjvvkJmZyddff01ycjKHDh26\n6+kok8nEnDlzWLRoEX5+fsTHxxMXF0dwcLC5zKeffsrQoUMZN24c586d4+mnn2bz5s33t1VCWMGO\nIxkA9O3S2DxNo9GYk0evUBn7SdRtFj0l9Prrr9OtWzfy8/MBaNWqFf/973/vulxSUhLNmzcnKCgI\nBwcHhg0bxqZNm8qV0Wg0FBYWApCfn4+/v/+9boMQVldqMLH7WBYNnR3o2kZOL4n6yaKEodPpGD9+\nPHZ2N+/ucHR0RGvBE6k6nY7AwLKjLn9/fy5fvlyuzAsvvMCqVauIiYlh2rRpvPHGG/cSvxDV4vDZ\nHAqLS+kVGoC9nTyNLeoni05J/folSfn5+Sil7rqcJWUSEhIYM2YMkydP5vDhw7z22mskJCTcdTlf\nX7e7lqkvpC3KWKst9iw/CsBD/drUmvauLXFWB2mLqmFRwhg4cCB//vOfKSoqYvny5fz3v/9lzJgx\nd10uICCAjIwM82edToefn1+5MsuWLWPhwoUAhIWFUVJSQm5uLl5eXpXWnZ1dYEnodZ6vr5u0xc+s\n1RapugIOn86mdRMPGmhrx74n+0UZaYsy95s4LepbP/XUU0RERNCxY0e2bdvGxIkTefzxx++6XGho\nKKmpqaSnp6PX60lISCAuLq5cmcaNG7N7924Azp07h16vv2uyEKI6KKXYcjCNtxcfQAEDIpraOiQh\nbEqjLDlvdB+2b9/O22+/jVKK+Ph4pk6dyscff0xoaCixsbGcO3eO119/nevXr6PVapkxYwbR0dF3\nrVeOGG6So6cyVdkW+UV6Fq1L5vDZHFwb2DN5SHu6tfOtkrqrg+wXZaQtytxvD8OihHHlyhW++uor\nUlNTMRgM5ukfffTRfa38fsgOcJP8GMpURVvkX9ezYW8qmw+kU1JqJKSZJ08P70gjN6cqirJ6yH5R\nRtqizP0mDIuuYTz33HN06NCB6Oho851SQtQl6dmF7EjKZNvhDEpKjXg0dCS+XzCxXYPQauXdL0KA\nhQmjuLiYN99809qxCFGtSkqNbD+cwe5jWVzU3TwC9WjoyJiYVsSENcbBXg6OhLiVRQmjS5cunDp1\ninbt2lk7HiGqRfLFqyxal8zlvGLstBq6BHvTMzSQsNbekiiEuAOLEsa4ceOYMGECAQEBODmVnctd\ntmyZ1QIToqqUGkwUXNejN5go0RvZejidbYcz0GhgUGRThkQ1x91VBtIU4m4sShivvfYa06ZNo0OH\nDnINQ9Qql/OK+etXB8gr1Jeb3sTXlSeGtqdloLuNIhOi9rEoYTg5OfHkk09aOxYhqpS+1Mi85UfJ\nK9TTtY0Pbi4OONrb4e/lQkxYYxniQ4h7ZFHC6NOnD9u3b6dv377WjkeIKqGUYvHGU6ReLqRvl8ZM\nHhJi65CEqPUsShhLlixhwYIFuLq64ujoiFIKjUbDTz/9ZO34hPhNth/JYNfRLJoHuPHYgDa2DkeI\nOsGihPHdd99ZOw4hqkROXjE/ndCxetcFXBvY8/yoTnLXkxBVxKKEERQk7yIWNVvSuRx+WHKE4+d/\nflWqg5ZnHuqIj4ezjSMTou6oNGG89tprvPvuu4wZMwaN5vanXeW2WmFrJqX4fucFvt+VAkBIM0+i\nOwbQrZ0fLg0sOh4SQlio0l/ULy87+sMf/lAtwQhxL4pLDCxMOMnB09n4eDTgjSd74OYodz4JYS2V\nJoxfXskaGRlZLcEIYYkSvZHEkzrWJ6aSlXudkGaePDuyE62CPGSQOSGsSPrsotbIKywh4aeL7D6W\nSXGJEY0G+ndrwiMPtJZnKoSoBpUmjNOnT1f4bgq5rVZUtwOnsvm/9ckUFpfi2dCRARFN6dulMV7u\nDWwdmhD1RqUJo0WLFixYsKC6YhHiNsUlBr758Qw7j2biYK/lsQFt5SltIWyk0oTh6Ogot9QKmzAY\nTWw7nMGa3SlcK9LT3N+NqSM6EOjtauvQhKi3Kk0YDg4O1RWHEMDN22R/OpbFqp0XyLl2AycHOx7q\n3ZJh0c2lVyGEjVWaMJYsWVJdcQhBenYh/7fhFGfTrmFvp2Vg96YM7SFDjwtRU1j9Lqnt27czd+5c\nlFKMGTOGqVOn3lZm7dq1/Otf/0Kr1dKuXTv+8Y9/WDssUYPkF+n5Yf8l1iemYjQpurXzZdwDbfD2\nkAvaQtQkVk0YJpOJOXPmsGjRIvz8/IiPjycuLo7g4GBzmYsXL/LFF1/wv//9j4YNG5Kbm2vNkISN\nGIwmVu9KQW8w4uxkj4uTPVfyb3Ai5SqXLhcC4O3uxGMD2hHWxsfG0QohKmLVhJGUlETz5s3NF86H\nDRvGpk2byiWMJUuW8Oijj9KwYUMAvLy8rBmSsJENe1NZvTvltun2dlo6tGhEp5be9OvamAaO8miQ\nEDWVVX+dOp2OwMBA82d/f3+OHj1arkxKSgoA48ePRynF888/T58+fawZlqhmV67dYPWuFNxdHHhh\ndGdKDEau3zDg2sCe1kEeODrIaLJC1AZWTRhKqbuWMRqNpKam8vXXX5ORkcFjjz1GQkKCucdxJ76+\nblUVZq1X09vi84ST6A0mnn+4C9Fdm1h1XTW9LaqTtEUZaYuqYdWEERAQQEZGhvmzTqfDz8+vXBl/\nf3+6du2KVqulSZMmtGzZkpSUFDp16lRp3TJm0E2+vm41ui2Onr/CT0czadPEg07NPK0aa01vi+ok\nbVFG2qLM/SZOq97YHhoaSmpqKunp6ej1ehISEoiLiytXpn///uzZsweA3NxcLl68SNOmTa0Zlqgm\npQYTX/9wGq1Gw4SB7SocIl8IUXtYtYdhZ2fHG2+8wZQpU1BKER8fT3BwMB9//DGhoaHExsbSp08f\ndu3axbBhw7Czs2PGjBl4eHhYMyxhRUopUnWFHDydzcHT2Vy+Wkz/iCY09av8FKMQoubTKEsuNNRA\n0sW8qSZ1t3OuFfOvFce4mHUzHns7LWGtvXliaHucnax/91NNagtbk7YoI21R5n5PSck9jKJKnEnL\n41/Lj5J/vZSubXyI7hhAp1ZecpusEHWI/JrFfTEpxc6kTBZvOIVS8NiAtjwQHiTXK4SogyRhiHtm\nMilOpl7l4Kmb1ymuFelxcbLn2VGd6NhCHrwUoq6ShCHuSeaVIr5Yc5ILmTdf39vQ2YHenQMZFt0c\n/0YuNo5OCGFNkjCERUxKselAGsu2nqPUYKJ7iB+xXYNo09QDO60MOy5EfSAJQ9xVzrVivkw4SXJq\nHg2dHXj6wQ5EhPjdfUEhRJ0iCUPckVKKHUmZfLvpDDf0RsJa+/D44HZ4NHSydWhCCBuQhFHPGYwm\ntBoNWm35u5oycopYsuUsSeeu4Oxkx5PD2tOzU4Dc/SREPSYJox7LL9Lz1n/2U1JqJCLEj6j2/rg2\nsGf17hT2nbyMAjq2aMQTQ9vj5S4vMxKivpOEUU+ZlOLzNSduvjfb0Y4tB9PZcjDdPL+ZX0OG92pJ\neFsf6VUIIQBJGPXW2p8ucvxCLqGtvHlxTCinUvNIPKEjr6iE2K5BhLWWRCGEKE8SRj10+lIeK3ac\np5GbE0892B57Oy0dW3rRsaU8dCeEuDNJGPWIUoqz6df4bNUxNGh4ZkRH3FwcbR2WEKKWkIRRD5Qa\njOw/lc0P+y6R8vNIsg/3C6ZtU08bRyaEqE0kYdRRpy/lcfB0NmfTr3ExqwCjSaEBurbxYWD3prRr\n1sjWIQohahlJGHXQ0fNX+HDJERRgp9XQzL8hIc0aEdM1CD9PZ1uHJ4SopSRh1DGX84pZ8P1x7Oy0\nPPtQRzq09MLJwc7WYQkh6gBJGHVIid7IJ98dpeiGgSeGhNC1ra+tQxJC1CEyzGgdoZRi0fpk0rIL\n6dc1iD5dGts6JCFEHWP1hLF9+3YGDx7MoEGDWLBgwR3LrV+/npCQEI4fP27tkOqU4hIDWw+n8+aX\n+0g8oSO4sTuP9m9j67CEEHWQVU9JmUwm5syZw6JFi/Dz8yM+Pp64uDiCg4PLlSsqKuKrr74iLCzM\nmuHUKSaTImHPRTbsTeX6DQN2Wg0RIX481r8N9nbScRRCVD2rJoykpCSaN29OUFAQAMOGDWPTpk23\nJYyPPvqIp59+mi+++MKa4dQZeYUlLPj+OMmpeXi6OdG/WxNiwoJo5CbDjgshrMeqh6I6nY7AwEDz\nZ39/fy5fvlyuzMmTJ8nKyiImJsaaodQZxy5c4S9f7iU5NY+w1j7Mm/EAI/u0kmQhhLA6q/YwlFJ3\nnT937lz+9re/WbzML3x93e4rttomJTOfxWtPsvdEFvZ2Gp5+qBPD+7RCo9HI8B63qG/7RWWkLcpI\nW1QNqyaMgIAAMjIyzJ91Oh1+fmWv9iwqKuLs2bNMnDgRpRQ5OTk899xzfPrpp3Ts2LHSurOzC6wW\nd02Sqitgw95U9hzXoYC2TTwY178NLQLcyckpxNfXrd60xd1IW5SRtigjbVHmfhOnVRNGaGgoqamp\npKen4+vrS0JCAu+//755fsOGDfnpp5/MnydOnMjMmTPp0KGDNcOq8fSlRvaevMzWw+mcz8gHbr6f\nYnRMMKGtvGTYcSGETVg1YdjZ2fHGG28wZcoUlFLEx8cTHBzMxx9/TGhoKLGxseXKazQai09J1UUm\nk2LXsUxWbD9PXqEeDdA52JuYsMZ0ae2DVhKFEMKGNKqW/oWua13Mkym5fLv5LJcuF+JoryUuogmx\nXYPw8ah87CfpbpeRtigjbVFG2qJMjT4lJe5OX2pkyZazbD6Yjgbo1SmAUX1byTu0hRA1jiQMG0q7\nXMj874+TnlNEkI8rTz7YnhYB7rYOSwghKiQJwwZMSrHpQBpLt5zDYDTxQHgQj8S2xlFGlRVC1GCS\nMKpZbv4Nvlx7khMpV2no7MATQzvStY2MKiuEqPkkYVQTfamR3ceyWLb1HNdLDHQO9uaJISF4NJQn\ntIUQtYMkDCvLyStm86F0dhzJoOiGAUcHLZMGtyOmS2N5nkIIUatIwrASk0mxLvEiK3dcwGhSuLk4\n8GDP5vQLC5I7oIQQtZIkDCu4cu0GX6w5walLeXg0dCQ+JpjI9n442MtFbSFE7SUJowoppdhzQsdX\nG09TXGIgvK0vjw9uJ4MDCiHqBEkYVSS/SM9/Npzi4OlsnBzsmDwkhD6dA+U6hRCizpCEcZ+UUuxL\nvsxXG09TWFxK26aeTBnWHj/Pyof0EEKI2kYSxn1Iu1zIf388TXJqHg72WsbFtaF/RBMZJFAIUSdJ\nwvgNCotLWbnjPFsOpaMUdAn2ZlxcG/y9XGwdmhBCWI0kjHtgMim2HU5n+fbzFN0wEODlwvj+bQht\n5W3r0IQQwuokYVjoVOpV/vvjGS5dLqSBox2PxLamf0QT7O2s+lp0IYSoMSRh3EVu/g2WbDnL3pOX\nAegdGsiYmFYypIcQot6RhHEHJXojG/elkrDnIvpSEy0D3Xh0QFuCG3vYOjQhhLAJSRi/YjCa2JmU\nyaqdF7hWpMfNxYHH+relV+dAuftJCFGvWT1hbN++nblz56KUYsyYMUydOrXc/EWLFrF06VLs7e3x\n8vJi7ty5BAYGWjus2xiMJhJP6Fjz00V0uddxdNDyYM8WDI5shksDyatCCGHVv4Qmk4k5c+awaNEi\n/Pz8iI+PJy4ujuDgYHOZDh06sHz5cpycnPjmm2/4+9//zgcffGDNsMopNZjYmZTBusRUcq7dwE6r\noV/XIEb0aoGnXKcQQggzqyaMpKQkmjdvTlBQEADDhg1j06ZN5RJGZGSk+f9hYWGsXr3amiGVk3ml\niM9WHefS5UIc7LXEhTdhcFQzvD1kNFkhhPg1qyYMnU5X7vSSv78/R48evWP5ZcuW0bdvX2uGZLbr\naCZfbTxNSamRPp0DGd1X7nwSQojKWDVhKKUsLrtq1SqOHz/O4sWLqzyOkxev8p/1yZQaTTg72qPR\naEjLLsTZyY5pD3Uksr1/la9TCCHqGqsmjICAADIyMsyfdTodfn5+t5XbvXs3CxYs4KuvvsLBwcGi\nun193Swqd+R0Nh8tS8JkMuHl4Uz+dT3Xbxjo2Mqb343rSoC3q2UbU4NZ2hb1gbRFGWmLMtIWVcOq\nCSM0NJTU1FTS09Px9fUlISGB999/v1yZEydO8Oabb7Jw4UIaNWpkcd3Z2QV3LXPswhX++d1RlFK8\nMDqUzsE+wM2ej0ajAZPJonpqMl9ft1q/DVVF2qKMtEUZaYsy95s4rZow7OzseOONN5gyZQpKKeLj\n4wkODubjjz8mNDSU2NhY3n33XYqLi3n55ZdRStG4cWPmzZt33+s+fCaHeSuPAfDimM7lxnuSd1QI\nIcS906h7udBQg2RnF1BSamRnUiYdWjQi8OdTS0op1iemsmzrOezttbw4JpROLevu4IBy9FRG2qKM\ntEUZaYsyNbqHYW3fbTvHj/vT0AARIX4MimzGjwcusee4jkZuTrwwOpSWge62DlMIIeqEWpswUnUF\nbDqQho9HA1ydHdiXfJl9yTcHCAxu7M7zo0PlwTshhKhCtTJhmEyKxRtPoRRMGtyOji28OHYhlw17\nU/Fv5MK4uDY42Muw40IIUZVqZcL4cV8q59LziQjxM1+fCG3lLS8yEkIIK6qVh+GL1pzAydGO8XFt\nbB2KEELUG7UyYRRc1zOyd0sauck1CiGEqC61MmE82Lslcd2a2DoMIYSoV2plwnhmVGd5l7YQQlQz\n+asrhBDCIpIwhBBCWEQShhBCCItIwhBCCGERSRhCCCEsIglDCCGERSRhCCGEsIgkDCGEEBaRhCGE\nEMIikjCEEEJYRBKGEEIIi1g9YWzfvp3BgwczaNAgFixYcNt8vV7PK6+8wsCBAxk7diwZGRnWDkkI\nIcRvYNWEYTKZmDNnDgsXLmTNmjUkJCRw7ty5cmWWLVuGh4cHGzdu5PHHH+fdd9+1ZkhCCCF+I6sm\njKSkJJo3b05QUBAODg4MGzaMTZs2lSuzadMmRo0aBcCgQYP46aefrBmSEEKI38iqCUOn0xEYGGj+\n7O/vz+XLl8uVuXz5MgEBAQDY2dnh7u5OXl6eNcMSQgjxG1g1YSil7rmMUgqNRmOtkIQQQvxG9tas\nPCAgoNxFbJ1Oh5+f321lsrKy8Pf3x2g0UlhYiIeHx13r9vV1q/J4aytpizLSFmWkLcpIW1QNq/Yw\nQkNDSU1NJT09Hb1eT0JCAnFxceXKxMbGsmLFCgDWr19Pjx49rBmSEEKI30ijLDlvdB+2b9/O22+/\njVKK+Ph4pk6dyscff0xoaCixsbHo9Xpee+01Tp48iaenJ++//z5Nmsj7uoUQoqaxesIQQghRN8iT\n3kIIISwiCUMIIYRFJGEIIYSwSK1LGHcbm6ouy8rKYtKkSQwdOpThw4fzn//8B4Br164xZcoUBg0a\nxJNPPklBQYGNI60eJpOJUaNGMW3aNADS0tJ45JFHGDRoENOnT8dgMNg4wupTUFDASy+9xJAhQxg2\nbBhHjhypl/vFokWLePDBBxk+fDivvvoqer2+Xu0Xs2bNomfPngwfPtw8rbL94K233mLgwIE89NBD\nnDx58q7116qEYcnYVHWZnZ0dM2fOZO3atXz77bd8/fXXnDt3jgULFhAdHc2GDRuIiopi/vz5tg61\nWvznP/8hODjY/Pkf//gHTzzxBBs2bMDNzY1ly5bZMLrq9fbbbxMTE8O6detYtWoVrVq1qnf7hU6n\nY/HixSxfvpzVq1djNBpJSEioV/vF6NGjWbhwYblpd9oPtm3bRmpqKhs3bmT27Nm8+eabd62/ViUM\nS8amqst8fX1p3749AK6urgQHB6PT6cqNxzVq1Ch+/PFHW4ZZLbKysti2bRsPP/ywedqePXsYNGgQ\ncLMdfvjhB1uFV60KCwvZv38/Y8aMAcDe3h43N7d6uV+YTCaKi4sxGAzcuHEDPz8/EhMT681+ERER\ngbu7e7lpv94PfvmbuWnTJkaOHAlAly5dKCgoICcnp9L6a1XCsGRsqvoiLS2N5ORkunTpwpUrV/Dx\n8QFuJpWrV6/aODrrmzt3LjNmzDAPI3P16lU8PDzQam/u0gEBAfVm30hLS6NRo0bMnDmTUaNG8cYb\nb1BcXFzv9gt/f3+eeOIJ+vXrR9++fXFzc6NDhw64u7vXy/3iF7m5ueX2g9zcXKD8OH5ws/10Ol2l\nddWqhCGPjNxUVFTESy+9xKxZs3B1da13Y29t3boVHx8f2rdvb94nlFK37R/1pV0MBgMnTpzg0Ucf\nZcWKFTg7O7NgwYJ6s/2/yM/PZ9OmTWzZsoUdO3ZQXFzM9u3bbytX39rlTir6e3q3trHqWFJVzZKx\nqcpAbb4AAAdwSURBVOo6g8HASy+9xEMPPUT//v0B8Pb2JicnBx8fH7Kzs/Hy8rJxlNZ18OBBNm/e\nzLZt2ygpKaGoqIi5c+dSUFCAyWRCq9WSlZVVb/aNgIAAAgICCA0NBWDgwIF8/vnn9W6/2L17N02b\nNsXT0xOA/v37c+jQIfLz8+vlfvGLO+0H/v7+ZGVlmctZ0ja1qodhydhUdd2sWbNo3bo1jz/+uHna\nAw88wPLlywFYsWJFnW+T6dOns3XrVjZt2sT7779PVFQU//jHP4iKimL9+vVA/WiHX/j4+BAYGMiF\nCxeAm9dyWrduXe/2i8aNG3PkyBFKSkpQSrFnzx7atGlT7/aLX/cc7rQfxMXFsXLlSgAOHz6Mu7u7\n+dTVndS6oUEqGpuqvjhw4AATJkygbdu2aDQaNBoNr7zyCp07d+Z3v/sdmZmZNG7cmI8++ui2C191\n1d69e/nyyy/57LPPuHTpEtOnTyc/P5/27dvz7rvv4uDgYOsQq0VycjJ/+tOfMBgMNG3alHfeeQej\n0Vjv9otPPvmEhIQE7O3t6dChA2+99RZZWVn1Zr949dVXSUxMJC8vDx8fH1588UX69+/Pyy+/XOF+\nMHv2bHbs2IGzszPvvPMOHTt2rLT+WpcwhBBC2EatOiUlhBDCdiRhCCGEsIgkDCGEEBaRhCGEEMIi\nkjCEEEJYRBKGEEIIi0jCEDXaAw88wNmzZ6tlXZ988km5oa9nzpzJ119/fd/1zpw5k+HDhzN9+vT7\nrqsyycnJrFu3zqrrEPWbJAwhfvbJJ59QWlpapXXm5OTw/9u7v5AmuziA49/ln7S8KOvWoghaI8KL\nihkJWon0R/Y8S2NYOL1IEFqE3gjRRZZEBcPyJqE/lDSIyBp2UV4IEVgGXeyiDKMVFnSRltTmaPr4\ney/Eh3KL9vYG7+vb73O182znnN/DYL+dHfY7fX199Pb2EgwGf+vYcz1//vyXE8b09PRvjkb9H2nC\nUPPS69evOXjwIDU1NRiGYZc+AHA6nXR1dVFdXU1FRQV9fX32c/fv32fnzp14vV66urpwOp0kEgna\n2tpwOBz4fD5M0yQWiwEwPDyM3++nsrKS1tbWH8Zz584dqqqq8Hg8BAIBPn78SDwex+/38/XrV0zT\n5OrVq9/1CYfDHDp0yG5blkVpaaldL+3ixYvs27cPr9dLU1MTY2NjAExOTnL69GmqqqowDINAIMD4\n+DidnZ08fvwY0zRpb28HZiojmKaJx+OhoaGBt2/fAjP/kDcMg5MnT+Lz+Xj48OE/eTvUn0KU+g8r\nLy+Xly9ffndtampKTNOUaDQqIiKxWEwqKyvt9tq1a+X69esiIvL06VMpLS0VEZHR0VHZvHmzjIyM\niIjIlStXxOl0ysTEhN0vkUjY87S2tkptba0kk0lJJpOye/duGRgYSIlxeHhYtm7dKqOjoyIi0tHR\nIUeOHBERkXfv3onb7U57b4lEQtxut3z69ElERPr7+8Xv94uISDgclmPHjtmvDYVC0tLSIiIinZ2d\nEggEZGpqSkTE7t/T0yOHDx+2+4yNjYnb7ZZXr16JiMjNmzelpqZGREQGBwfF5XJJJBJJG5tS6egK\nQ807b968IRqN0tzcjGEY7N+/n8nJye9OX9y1axcAxcXFfPjwgWQySSQSYf369RQVFQFQXV2dMrbM\nqZSzY8cOcnJyyMnJweVyMTIyktJncHCQsrIyli1bBoDP52NgYOCn95GXl8f27du5e/cuMFMYbvYQ\npP7+fh49eoRhGBiGQSgU4v3798BMefe6ujqysrIA7Oqsc0UiEdatW8fq1asB2Lt3L0NDQ0xMTACw\ncuVKNmzY8NM4lZo1r8qbKwUzH+qFhYXcvn077fMOh4OFCxcC2AfnWJaVkgzmttPJzc21H2dlZaU9\nD1pEUs4RmJ33ZwzD4NSpU+zZs4cnT55w9uxZe8ympia8Xm/a+TKRLq5v24sWLcpoHKVm6QpDzTur\nVq0iLy+PcDhsX4tGo8TjcSD1A3W2XVxczLNnz+zf8b/d9wAoKCjgy5cvfzuekpISHjx4YO8x3Lhx\ngy1btqTMn87GjRuJxWIEg0EqKirsRLdt2zZCoRCfP38GIJlM8uLFCwDKy8u5du2avUE/e5JeQUGB\nvfcye79DQ0N22fOenh5cLpcmCvXLdIWh/tMcDgf19fVkZ2fb35h7e3u5cOEC7e3tXL58GcuyWL58\nOR0dHXafuWPAzEEyx48fp7GxkaVLl1JWVkZ2djb5+fkANDQ0UFdXR35+Pt3d3RnHuGbNGpqbm6mv\nr2fBggUUFRXR1taWMv+PGIbB+fPnCYVC9jWPx8P4+DgHDhzA4XAwPT1NbW0tTqeTxsZGgsEghmGQ\nm5vLihUrOHfuHCUlJVy6dAnDMNi0aRNHjx7lzJkztLS0YFkWhYWF9gpGqV+h5c3VHyUej7N48WJg\n5hv3rVu3fst/LZT6E+gKQ/1Ruru7uXfvHpZlsWTJEk6cOPFvh6TUvKErDKWUUhnRTW+llFIZ0YSh\nlFIqI5owlFJKZUQThlJKqYxowlBKKZURTRhKKaUy8hf8CwfjbzhfpQAAAABJRU5ErkJggg==\n",
+            "text/plain": [
+              "\u003cmatplotlib.figure.Figure at 0x7f47b218dbd0\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "output_type": "display_data"
+        }
+      ],
+      "source": [
+        "plt.plot(ag_means)\n",
+        "plt.ylabel('Time(s)')\n",
+        "plt.xlabel('Length of vector')\n",
+        "_ = plt.title('Time to sum the elements of 1000 vectors (AutoGraph)')\n",
+        "_ = plt.ylim(ymin=0)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "d7IAJ6Bwbk9t"
+      },
+      "source": [
+        "## Eager"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "XMu5-12yoOzY"
+      },
+      "outputs": [],
+      "source": [
+        "from tensorflow.python.eager import context"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "_vt9MzpyjQ4T"
+      },
+      "outputs": [],
+      "source": [
+        "# Sum written using for loop and run with tf.eager\n",
+        "def sum_all(elements):\n",
+        "  sum_ = 0.0\n",
+        "  length = elements.shape[0]\n",
+        "  for i in tf.range(length): \n",
+        "    sum_ += elements[i][0]\n",
+        "  return sum_\n",
+        "\n",
+        "eager_means = []\n",
+        "for num in range(max_elements):\n",
+        "  with context.eager_mode():\n",
+        "    durations = []\n",
+        "    for i in range(trials + burn_ins):\n",
+        "      \n",
+        "      start = time.time()\n",
+        "      for _ in range(batches):\n",
+        "        run_trial(num)\n",
+        "      \n",
+        "      if i \u003c burn_ins:\n",
+        "        continue\n",
+        "      \n",
+        "      duration = time.time() - start\n",
+        "      durations.append(duration)\n",
+        "    eager_means.append(np.mean(durations))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 301
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 422,
+          "status": "ok",
+          "timestamp": 1532460024499,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 240
+        },
+        "id": "5gHVdMlD-A-T",
+        "outputId": "3b581cb7-7ef9-489c-92f1-3e52c0c2dc8a"
+      },
+      "outputs": [
+        {
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAEcCAYAAAAydkhNAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3XlclNX+wPHPsC+CILviCiruhAiiqaAimqnglpZLWpkt\ntmh50+rXvZZ2q9veq5ulZYtlaZp7mQuaorjklkoqiwgKsojsy8yc3x9cBxFUVGBYvu+/mGfmPM93\nzjzMd85zznOORimlEEIIISphYuwAhBBC1F2SJIQQQtyQJAkhhBA3JElCCCHEDUmSEEIIcUOSJIQQ\nQtyQJAlg8eLFvPrqq8YOo14ZOHAge/furfHjzJs3jw8//LDGj1NXxMfHExERQc+ePfnuu++MHU6D\ndvbsWcaMGWPsMCo1btw4YmNjjR0G0EiSxD333IOfnx9+fn506tSJHj16GLZt2LCBxx9/nNdff73G\n40hOTsbHxwe9Xl/jx6pOje2L+lZq8nNcsmQJgYGBHDp0iEmTJlV4fvPmzUyYMAFfX1+mTJlS4flT\np04xevRofH19GTNmDDExMeWef+eddwgMDKR379688847t1W2pk2ePJlVq1bV2vE++ugjHn30UcPj\ngQMH0qNHD/z8/AzfD2+88UatxXOtRx55pM78zzWKJHH48GH+/PNP/vzzT5o3b87ixYsN2+6///5a\ni0MphUajQe5frN9q8nO8cOEC3t7eN3zewcGBhx9+mBkzZlR4rqSkhKeeeorw8HAOHDhAeHg4Tz75\nJFqtFoAVK1awfft21q9fz7p164iMjOTHH3+sUtn64HY+j7S0NKKjoxk0aFC57YsXL+bPP/80fD+8\n8sor1R3mTel0OqA0YUVHR5Oenl6rx69Mo0gS11JKVTiZPvnkE1588UWg7Ffi6tWrCQ4OJjAwkBUr\nVnD8+HFGjhxJQEBAhVbHqlWruO+++wgMDOTRRx/lwoULlR578uTJAPj7++Pn58fRo0dRSvHpp58y\ncOBA+vbty0svvURubm6l5S9fvszMmTPp1asXgYGB5X5p+vj4cP78ecPja3/979+/nwEDBrBkyRL6\n9OlDv3792Lp1Kzt37iQsLIzAwEAWL15c6TF/+ukn1q9fz5IlS/Dz8+OJJ54wPHfq1ClGjhxJr169\nmD17NsXFxYbnduzYQXh4OL169WLixIn8/fffle4fIDY2lunTpxMYGMiwYcPYvHnzDV97s/0OHDiQ\npUuXMnLkSO655x5eeeUVMjIyeOyxx/Dz82P69Onk5OQYXn/kyBEmTJhAr169CA8PZ//+/YbnJk+e\nzIcffsjEiRPx8/PjkUceISsry/AclP8cExMTmTx5Mv7+/gQFBTF79uwbvodt27Zx//33ExAQwJQp\nU4iLiwNg6tSpREdHs2DBAvz8/Dh37lyFskFBQQwdOhQXF5cKz+3fvx+dTseUKVMwNzdn8uTJKKXY\nt28fAL/88gvTp0/H1dUVV1dXpk2bxpo1awCIjo6+adlrbdq0qcJlmmXLlvHkk08CUFxczFtvvUVI\nSAj33nsv//znP8udG1u3biU8PJyePXsyZMgQdu/ezfvvv8+hQ4d4/fXXy/2C//PPPxk7diy9evVi\n3LhxHD58uNxn9P777zNx4kR8fX1JSkpi9erVDB48GD8/PwYPHsyGDRsq/Qz27NlDly5dsLCwKLf9\nRonm/PnzTJ06lcDAQIKCgnjhhRfK/Z+eOHHCcJnw2Wef5fnnny/XErjVefvFF18Yzlu9Xo+FhQVd\nunRh9+7dlcZTq1QjExISoqKiospt+/jjj9WLL76olFIqKSlJdezYUb322muqqKhI7dmzR3Xr1k09\n9dRTKjMzU6WkpKigoCB14MABpZRSv//+uxoyZIiKi4tTOp1O/fe//1UPPPBApcdOSkpSPj4+Sq/X\nG7atXLlSDRkyRCUlJan8/Hz19NNPG2K53rvvvqtee+01pdPplFarVQcPHjQ85+PjoxITEw2PX3rp\nJfXBBx8opZSKjo5WnTt3Vp9++qnSarXqp59+Ur1791Zz5sxR+fn56syZM6pbt27q/PnzlR732n1d\nW4/jxo1TaWlp6sqVK2rYsGFqxYoVSiml/vrrLxUUFKSOHTum9Hq9WrNmjQoJCVHFxcUV9p2fn68G\nDBig1qxZo/R6vTp58qQKDAxUZ8+erXDsW+03JCREPfDAAyojI0OlpqaqoKAgFRERoU6dOqWKi4vV\nlClT1CeffKKUUiolJUUFBASoXbt2KaWUioqKUgEBASozM1MppdSkSZNUaGioOnfunCoqKlKTJk1S\n77777g0/x9mzZ6vPPvtMKaVUUVGROnToUKV1GRcXp3x9fVVUVJTSarXqiy++UKGhoaqkpMRw3JUr\nV1Za9lo//fSTmjx5crltX331lXrsscfKbXv88cfVV199pZRSqmfPnuro0aOG544fP678/PyqVPZa\nBQUFys/PT507d86wbcyYMWrTpk1KKaXeeOMN9cQTT6js7GyVl5enZs6cqd577z2llFJHjx5VPXv2\nNPwPpqamqri4uErfe1ZWlurVq5dat26d0ul0asOGDapXr14qKyvL8PqQkBB19uxZpdPpVE5OjvLz\n81MJCQlKKaXS0tIM59H13nrrLbVgwYJy2yr7brjq3LlzKioqSpWUlKjMzEw1adIktWjRIqWUUsXF\nxSokJER9++23SqvVqi1btqguXbrc1nkbHh6uUlJSVFFRkeGYr7/+uvr3v/9daTy1qdG1JKpCo9Hw\n1FNPYWFhQZ8+fbC2tmb48OE4Ojri5uaGv78/J0+eBODHH39kxowZtG3bFhMTE2bMmEFMTAwXL168\n4f7VNb9WNmzYwMMPP0yLFi2wtrZm9uzZbNq0qdLr3WZmZqSlpZGUlISpqSk9e/asdJ+VMTc3Z+bM\nmZiamnLfffdx+fJlpk6dirW1Nd7e3nh7e9/0135lpkyZgrOzM/b29oSEhHDq1CkAVq5cyYQJE+jW\nrRsajYbw8HAsLCw4evRohX3s2LEDT09PwsPD0Wg0dOrUiSFDhvDrr79WeG1V9jtp0iSaNWuGq6sr\n/v7+9OjRAx8fH8zNzQkNDTXEuG7dOoKDg+nXrx9Q+gu9a9eu7Ny507Cv0aNH06pVKywsLBg2bJih\n7FXX1rmZmRnJycmkpqZiYWGBn59fpXW2efNmgoODCQoKwtTUlEceeYTCwsJyv5DvVH5+PnZ2duW2\nNWnSxPCL9/rn7ezsyM/Pr1LZa1lZWTFo0CDDr/SEhATi4+MNl25WrVrFvHnzsLOzw8bGhhkzZhhe\nu2rVKsaOHUtQUBAArq6utG3bttL3ExkZSZs2bRgxYgQmJiYMHz6cdu3asWPHDsNrIiIi8PLywsTE\nBFNTU0xNTTl9+jRFRUU4Ozvj5eVV6b5zcnKwtbWtsP2pp54iICCAXr16ERAQwMqVKwFo1aoVQUFB\nmJmZ4ejoyNSpUzlw4ABQ2iLV6XRMmjQJU1NTQkND6d69u2GfVTlvp0yZgpubW7mWja2tLdnZ2ZXG\nX5vMjB1AXeXk5GT428rKCmdnZ8NjS0tLwz/XhQsXWLhwIW+99RZQdr06NTUVDw+PWx7n0qVLNG/e\n3PC4RYsWaLVa0tPTcXV1LffaRx99lI8//pjp06ej0WgYN25cpdemK+Pg4IBGozG8n8re49X3VFXX\nlre2tiYtLQ0orZO1a9caRucopdBqtVy6dKnCPi5cuMCRI0cICAgwvFan0xEeHl7pa2+132tjsrS0\nrPD42s9t8+bNhi+cq/u6+uUFlPvMra2tb1o/c+fO5YMPPmDs2LGGfoPKRs5c/3lrNBo8PDxITU29\n4b6rysbGpsKXem5uLk2aNKn0+dzcXGxsbKpU9nrDhw/n7bff5sknn2TDhg0MHjwYCwsLMjMzKSgo\nKPfe9Xq9IaGmpKQwYMCAKr2f6+sKoHnz5uXqyt3d3fC3tbU177//PkuXLmX+/Pn07NmTuXPn0q5d\nuwr7tre3Jy8vr8L2Tz/9lN69e1fYnpmZyRtvvMHBgwfJz89Hp9Ph4OAAlPZvuLm5lXv9tf/7VTlv\nr30fV+Xl5WFvb19he22TJHGX3N3deeKJJ6rUAX71S/parq6u5fowkpOTMTMzK/cFdZWNjQ3/+Mc/\n+Mc//kFsbCyTJ0+me/fu9O7dG2trawoKCgyvTUtLq/TEqw3u7u7MnDmTxx9//Jav9fDwIDAwkKVL\nl1brfqty3PDwcBYsWHDbZSv7HJ2cnAx9VYcOHWLatGkEBATQsmXLcq9zdXXlzJkz5bZdvHixWj6r\n9u3bs2zZsnLbTp8+behD8fb2JiYmhm7dugGlfUrt27e/adnKRlgB3HvvvcybN4+YmBg2btzI/Pnz\nAXB0dMTa2poNGzZU+JEDpZ/htX1n17q+Xl1dXdmyZUu5bRcuXKB///43LNO3b1/69u1LcXEx77//\nPq+++irLly+vcKyOHTuydu3aCttv1CJ/99130Wg0bNiwAXt7e7Zu3WroN3FxcamQ5C9evEirVq0M\n7/lOztu4uDhGjhx5W2VqglxuqsStLt1ca+LEiSxevJizZ88Cpc3Yyi6VADRr1gwTExMSExMN24YP\nH86yZctISkoiLy+P999/n+HDh2NiUvGjiYyMNJS1sbExNK+htON6w4YN6PV6du3aZWgKVwdnZ+cb\n/mNXZvz48axYsYJjx44BpZcydu7cWekv8eDgYOLj41m7di1arZaSkhKOHz9u6My90/3eysiRI9m+\nfTu7d+9Gr9dTVFTE/v37q/SLvrLP8ddffzWUtbe3x8TEpNLPcNiwYURGRrJv3z60Wi1Lly7F0tIS\nX1/fKsWt1+spLi5Gq9WW+xsgICAAExMTvv32W4qLiw2/XAMDAwEIDw9n2bJlpKamkpqayrJlyxg9\nevRNy1b2qxrA1NSUsLAw3n77bbKzs+nbty+AoYW7aNEiMjMzAUhNTTV0wI4dO5bVq1ezb98+lFKk\npqYaPuvrz7MBAwZw7tw5Nm7ciE6nY9OmTcTFxRESElJpTBkZGWzfvp2CggLMzMwM/yOV6du3LydO\nnCjXoX4zeXl52Nra0qRJE1JTU8v9qPH19cXU1JTly5ej0+nYunWr4RyFOztvi4uLOXHihKFejanR\nJYnKfgXe6jU3ezx48GAee+wxnn/+efz9/Rk5ciR//PFHpfu1srJi5syZTJw4kYCAAI4dO8bYsWMZ\nNWoUkyZNIjQ0FGtr6xsOu0tISODhhx/mnnvuYeLEiTz00EP06tULgJdffpnt27fTq1cvNm7cyODB\ng+/qPV5r7NixnD17loCAAJ5++ulbvr5r1668/vrrLFiwgICAAMLCwgyjaK5na2vLl19+yaZNm+jX\nrx/9+vXj3XffrfSf91b7vZ335O7uzqeffsrixYsJCgoiJCSEL7/80vAD4WZlK/scjx8/zrhx4/Dz\n8+Opp57i5ZdfpkWLFhXKtm3blnfeeYfXX3+doKAgIiMj+eyzzzAzM7vlcQHWrl1L9+7dWbBgAYcO\nHaJHjx6GG0HNzc359NNPWbNmDQEBAaxevZpPP/3UsO8JEyYQEhLCyJEjGTlyJCEhIYwfP75KZSsz\nfPhw9u7dy7Bhw8olxBdeeIHWrVszfvx4/P39mT59OgkJCQB0796dRYsWsWjRInr27MmUKVMM/XdT\npkzh119/JTAwkIULF+Lg4MBnn33G0qVL6d27N0uXLmXx4sU0bdq00rrS6/V89dVX9O/fn969e3Pg\nwAFee+21SmN3cnKid+/ebN26tdz2J554wnBPlZ+fH7NmzQLg6aef5q+//sLf35+ZM2cSFhZmKGNu\nbs7HH3/MypUr6dWrFxs2bGDgwIGG/oXbPW+hdARcYGBgpaPYaptG3c7P5ts0f/58IiMjcXJyYv36\n9QDExMTw2muvUVRUhJmZGa+99pqh+SuEELUlNjaWl156ydA5XZ3Gjx/PxIkTiYiIuKPyDzzwAAsX\nLrzpPTO1pUaTxMGDB7G1tWXu3LmGJPHII48wbdo07r33Xnbu3MmSJUv49ttvayoEIYSocQcOHKBt\n27Y4Ojqybt06/vWvf7F169ZK+xbrmxrtuPb39yc5ObncNo1GY7ihKScnp8KoACGEqG/i4+N57rnn\nyM/Pp1WrVnz00UcNIkFADbckoHS0zsyZMw0tidjYWB599FHDnc8rVqyo0lBRIYQQta/WO65/+OEH\nXn75ZSIjI5k3b55h6JwQQoi6p9aTxC+//GIYeTN06NByQ8VupoYbPEIIUe9l5xVz/Gx6tX5f1vjN\ndNcH6+bmxv79+wkICGDv3r20adOmSvvRaDSkpeXc+oWNgIuLndTF/0hdlJG6KNPY6iL9SgFb9p9n\n17ELFJfo+ee0XrRyK51mxcXF7halb65Gk8ScOXOIjo4mKyuL4OBgZs2axeuvv84bb7yBXq/H0tKy\nVtZxEEKIhijjSiG/7I5j71+p6JWimb0lw4Jb09K18ulU7kSNd1xXp8b0y+BmGtuvpJuRuigjdVGm\noddFXmEJG/eeY+vBJLQ6Pc2dbRkW2IrAzm6YmZbvRajTLQkhhBDV61RCJv9de4LcghKa2VsS0a8d\nQV3cMTG59WwSd0KShBBC1BORR5JZvuU0AGODvRjc0xML88rnp6oukiSEEKKO0+n1/LQ9lt8PnqeJ\ntTlPj+5Gh5YOtXJsSRJCCFFHXczIY/fxi+z9K4Ws3GI8nGx4dlwPXB2say0GSRJCCFHHFBRp+WL9\nSY6cTQfA2tKMgX4tGN3fCxur2v3aliQhhBB1SGZ2IR+sPEZSWi7enk0Z5OfJPe2da7zv4UYkSQgh\nRB1x/lIuH6w8yuWcIkL8WvDQ4A41NmqpqiRJCCGEESmlOJt8hT3HU4g+mUpRiY7xId6EBbSs0iJp\nNU2ShBBCGMmhvy+xMjKWS5dL16d3tLNk+vBO9PKpuD64sUiSEEKIWqbXK9b8EcfGvecwNzMhqIsb\nfbp50KmVo9EvL11PkoQQQtSi/MISFq87yfG4DFwdrZk1uhstXKpvrqXqJklCCCFqScaVQt798Qgp\nmfl0bdeMx0d2wdbK3Nhh3ZQkCSGEqAUX0vN498cjXM4pYkivlowP8a5zl5YqI0lCCCFqWNyFbD5Y\neZTcghLGhXgxLLC1sUOqMkkSQghRA5RSJKTksPvYRfb8dZESrZ5pw3zo16O5sUO7LZIkhBCimh2L\nzWBV5FmS0vIAcGhiweQRHbmng4uRI7t9NZok5s+fT2RkJE5OTqxfv96w/dtvv2X58uWYm5szYMAA\nXnjhhZoMQwghas3Wg+f5YdsZTDQaenZ0oV93D7q0bYapicmtC9dBNZokRo8ezeTJk5k7d65hW3R0\nNDt27GDDhg2YmZmRmZlZkyEIIUSt0OsVK7afYevBJOxtLXh2bHfaetgbO6y7VqNJwt/fn+Tk5HLb\nfvjhBx577DHMzEoP3axZs5oMQQghaoRerzgWm0Fyei5pWYWcS83hXEoOzZ1teW5sd5xrcTrvmlTr\nfRIJCQkcPHiQ999/H0tLS+bOnUu3bt1qOwwhhLhjWp2epRtPEX0ytdz27l5OzBjRGZs6fu/D7aj1\nJKHT6cjOzuann37i2LFjPPfcc2zbtq1KZe92Qe+GROqijNRFGamLMjVVF0UlOv799QEOnkqlU5tm\njB3UHrdmNrg52mBl2fDGAtX6O3J3d2fIkCEAdO/eHRMTEy5fvoyjo+Mty6al5dR0ePWCi4ud1MX/\nSF2UkbooU1N1UVCk5eOfjxGTmEWXts14OqIblhal6zzkZBdQF2v/bpNljXe3K6XKPR48eDB79+4F\nID4+Hq1WW6UEIYQQxhR/MZvXvz5ITGIWPTu68MyY7oYE0ZDVaEtizpw5REdHk5WVRXBwMLNmzWLM\nmDHMmzePESNGYG5uzltvvVWTIQghxF3R6fVs3HuO9XsS0OkVQ3q1ZFyIV70d0nq7NOr6n/p1mDSl\nS8llhTJSF2WkLspUV12kXylg8boTxCZn42hnySPDO9G5Tf0akXm3l5saXi+LEEJUgyNn01m64SR5\nhVoCOrkyOaxjnZ+xtSZIkhBCiGsUlehY+0c8v+5PxMzUhKlDO9K/R/M6sZSoMUiSEEIIILeghO2H\nkth6KIncghLcmtnwxKgutHJr3MOKJUkIIRo1vV6xYW8Cm/ado7hEj62VGff3acOwwFZYN8D7Hm6X\n1IAQotG6klfM5+tOcOrcZRyaWDCmf2v69fDAykK+Gq+SmhBCNEp/J17ms7UnuJJXjK+3M4/c36lR\ndkzfiiQJIUSjUqLV88vuOH6NTkSDhnEhXgwNaNVoO6ZvRZKEEKLRSEzNYcmGkySl5eHc1IrHRnSm\nvaeDscOq0yRJCCEavBKtno17E9i49xw6vWKAb3PGh3hLx3QVSA0JIRq00+ez+PrXGC5m5ONoZ8nU\noT5093Iydlj1hiQJIUSDlH6lgLV/xLPnrxQ0wCA/T0YPaCeth9sktSWEaFCu5Baxenc8m6NKJ+Tz\ndLFlylAfvFs0NXZo9ZIkCSFEvafT6zkRn8nu4ykcOZOGVqdwcbAi/N52BHZ2w8RERi7dKUkSQoh6\nq6hYx/Y/k9hy8DxXcosBaO5sS/gAL3zbNcPMtHFM512TJEkIIeqdohIdO/5MZnP0OXLyS7C2NCPE\nrwX3dvOgjbsdrq72Mm16NZEkIYSoV1Iz8/lw1TFSMvOxtjRlZN82DOnVEhu5W7pG1GhbbP78+fTp\n04cRI0ZUeG7p0qX4+PiQlZVVkyEIIRqQUwmZvPHNQVIy8xnU05O3ZvYhvF87SRA1qEaTxOjRo1m6\ndGmF7SkpKURFRdG8efOaPLwQooEo0erYevA87/10lMJiHdPu8+Gh0A40sZbkUNNq9HKTv78/ycnJ\nFbYvWrSIuXPn8sQTT9Tk4YUQ9ZhSiuNxGUSfvMSRs2kUFOloYm3O06O70aGlTKVRW2q9T2L79u14\neHjQsWPH2j60EKIeWRUZy+boRACc7K0Y4NuCQX6eODW1MnJkjUutJonCwkI+++wzvvzyS8M2pVSV\ny9/tgt4NidRFGamLMg2lLk7EZfDr/kQ8nG2Z/aAfHVs53vYsrQ2lLoytVpNEYmIiycnJjBo1CqUU\nqampjBkzhpUrV+LkdOu5VGRIWykXFzupi/+RuijTUOqiqFjHe98fAgXThvrgZGNOenrube2jodRF\ndbjbZFnjSeLalkKHDh3Ys2eP4fHAgQNZs2YNTZvK7fJCiFKrdsZy6XIBQwNa4e0p3w3GVqOjm+bM\nmcOECROIj48nODiYn3/+udzzGo3mti43CSEatphzl9l2KAkPJxvC+7U1djiCGm5JvPvuuzd9ftu2\nbTV5eCFEPXL4TBpfbjyFRgOPDO+MhbmpsUMSyB3XQggj0+r0rIqMZcuB85ibmTD9vk60a25v7LDE\n/0iSEEIYTWJqDl//+jfxF7Nxb2bDE+FdaenaxNhhiWtIkhBC1LrcghLW7Ioj8kgySkFQFzcmh3XE\nykK+kuoa+USEELXm6uytG/cmkFeoxcPJhomD29O1rSwnWldJkhBC1LgSrY7IIxfYtPccV/KKsbY0\nY8JAbwb29JQ1H+o4SRJCiBqhV4qzSVfYdzKVA6dSySvUYmlhyv19WhMW0Apbmbm1XpAkIYSodhcz\n8vho1TFSLxcA0NTWgvt6tyYsoCV2NhZGjk7cDkkSQohqdTmniPd+PEJGdhG9u7jRt6sHnVo7yjrT\n9ZQkCSFEtckvLOG9n0oTRET/dozo08bYIYm7JD1GQohqUVSi46NVx0hOy2OQnyf3B7U2dkiiGkhL\nQghxx/ILtRw9m87hs+n8FZdBYbGOXj6uTBzc/ran9hZ1kyQJIcQdOZt8hU9+PkZ2fgkALg5WDOrp\nyci+baX/oQGRJCGEuG37TqTw5aYY9HrF/X1aE9jJjebOttJ6aIAkSQghqkwpxdrd8azbk4C1pSlP\nhHeTu6UbOEkSQogqUUqx/PfTbP8zGeemVjw7rgctnG2NHZaoYZIkhBC3pFeK5VtOs+NwMp4uTXhh\ngi/2tnJTXGNQo0li/vz5REZG4uTkxPr16wF4++232bFjBxYWFrRq1Yo333yTJk1kamAh6iq9Uny3\n5TSRh5Np6VqaIOSu6cZDo2pw/dCDBw9ia2vL3LlzDUkiKiqK3r17Y2Jiwn/+8x80Gg1z5syp0v5k\nYfNSssh7GamLMtVVF4XFWtbtTiD2whWy84rJzi+moEhHK9cmvDDxHppY1/05l+S8KOPiYndX5Wu0\nJeHv709ycnK5bX369DH87evry2+//VaTIQghbkNs8hW+WH+SS1kFaDRgZ22Ok70VLVya8FBoh3qR\nIET1MmqfxKpVqxg+fLgxQxBCAHq9Yn1UAuv3JKCUYlhgK8L7tcPcTCZlaOyMliT++9//Ym5uzogR\nI6pc5m6bTQ2J1EUZqYsyd1IX+YUlvPPdIQ6eSsXZwZrZE/3o5u1cA9HVLjkvqodRksSaNWvYuXMn\n33zzzW2Vk2uMpeR6axmpizJ3UheZ2YV8uOoY5y/l0qVtM2aO6oKtlXm9r1M5L8rU6T4JKB1bfa1d\nu3axZMkSvvvuOywsZISEEMZyLiWHD1cdJSu3mGDf5jwY2kFWiRMV1GiSmDNnDtHR0WRlZREcHMys\nWbNYvHgxJSUlTJ8+HYAePXrwz3/+sybDEEJc5+/Ey3y46hhFxTrGh3gTFtBSptQQlarRIbDVTZqP\npaQpXUbqokxV6+LI2XT++8tf6PWKx0Z0JqCTWy1EV7vkvChT5y83CSGMK7eghKycIrLzi0lMzWVV\nZCxmphqeGdudbu1k3iVxc5IkhGjA/jh6gW9++xudvuyCgY2lGc+O6057TwcjRibqC0kSQjRQx2Iz\n+PrXv7GxMqNXJ1fsbSywszGnWzsnXBysjR2eqCckSQjRACWkZPPfX/7C1FTDs2O749WiqbFDEvWU\njHcTooFJSsvlg5XHKC7RMWNEF0kQ4q5IS0KIBuDvxMus3h3PgRMppF4uAOCh0A707Ohi5MhEfSdJ\nQoh6rKhEx4/bzxJ5uHQiTUsLU+5p70xAJzcCOze8oa2i9kmSEKKeSrqUy2frTnAhPQ9PF1seH90d\nN3tLuWtXb7olAAAgAElEQVRaVCtJEkLUMyVaPb/tT2TdngS0Oj2D/DwZF+JFi+YOcgOZqHaSJISo\nR04mZPLdltOkZOZjb2PO1GFduKe99DuImiNJQoh6oLBYy/LfT7PneAoaYKBfC0b3b4eNlSwCJGqW\nJAkh6rjE1Bz++8tfpF4uoLWbHVOHdaSNu72xwxKNhCQJIeoopRTb/0zmx+1n0OoUQwNaMXpAO+mY\nFrVKkoQQdZBSip93xrFp3zmaWJvz6P2d6O5V/1eLE/WPJAkh6hi9UqzYeoath5Jwc7TmhQn34NTU\nythhiUbqlkni/PnzrFq1iujoaFJSUrC0tMTHx4ewsDCGDBmCmdmNdzF//nwiIyNxcnJi/fr1AFy5\ncoXnn3+e5ORkPD09+eCDD7Czk7VoReNUotVzObeIgkItCoVSsPNIMruOXqSFiy0vPOBL0yaWxg5T\nNGI3XXTo//7v/zhx4gRDhw7lnnvuwdnZmaKiImJjY9m9ezcnT57kn//8J76+vpWWP3jwILa2tsyd\nO9eQJN555x0cHBx47LHH+Pzzz8nOzuaFF16oUrAyBryULKhSpj7WRVpWAd/8GsP5S7lk55dU+prW\nbnbMmeBLE+uqj16qj3VRU6QuytTookODBg1iwYIFFbZ37NiR++67j6ysLM6fP3/D8v7+/iQnJ5fb\ntm3bNr777jsAIiIimDx5cpWThBD1XWzyFT76+Rg5+SW4OlrTwqUJjnaW2FiZYaLRoNGAjZU5g/xa\nyPBWUSfcNEkMGDDgpoUdHBxwcLi9hUsyMzNxdi7tgHNxceHy5cu3VV6I+upAzCWWbDiJTqeYPKQD\nIX6exg5JiFuq0li6f//73+Tk5KDVannwwQfx9fVl7dq1NR2bEPVadl4xO48k882vMfxr2YHS9R1M\nNDw7rrskCFFvVGl0U1RUFC+99BKRkZG4ubnx/vvvM2PGDEaNGnXbB3RyciI9PR1nZ2fS0tJo1qxZ\nlcve7bW1hkTqokxdrIviEh3zv9hHSkY+AGamJnRq04wnx/agjUfN3QhXF+vCWKQuqsdtDYE9cOAA\noaGhuLm5odFoqlTm+n7xgQMHsnr1ambMmMGaNWsYNGhQlY8vHVGlpFOuTF2ti1+jE0nJyCeoiztD\nerWkhYut4Sa4moq3rtaFMUhdlLnbZFmly01OTk688sorbNq0ib59+6LVatHpdLcsN2fOHCZMmEB8\nfDzBwcH8/PPPzJgxg6ioKMLCwti7dy8zZsy4qzcgRF2TW1DChqgEbK3MeDC0Pa3d7eQuaVFvVakl\n8e6777Ju3TrGjh1L06ZNSUpKYtq0aVUqV5lly5bdVpBC1CcbohLIL9LywEBvbGWEkqjnqpQkmjVr\nxsMPP2x47OnpiaendLwJcb20rAK2/5mEc1MrBkrntGgAbtoGfvLJJzl27Filz+Xm5vL111/z448/\n1khgQtRHa3bFodUpRvdvh7mZXGIS9d9NWxLPPPMM7777LgkJCXTv3h0nJyeKioqIi4sjOTmZCRMm\nMHHixNqKVYg6Sa9XHI1NZ+vBJE6du0xrdzsCZH1p0UDcNEn4+PjwxRdfcPHiRfbv309qaiqWlpYM\nHTqUnj17YmFhUVtxClEnnU26wpINJ7mUVQBAp9aOPBTaAZMqjv4Toq6rUp+Eh4fHHd0TIURDlltQ\nwqe/HCc7r4T+PTwY3LMlnq5NjB2WENWqShdNMzIyeOGFF3jooYcAiImJ4YcffqjRwISo677b8jdZ\nucWE92vLw8M6SYIQDVKVksQrr7xCz549yc7OBqBdu3Z8//33NRqYEHXZvpMp7D91Ce8WTRnWu5Wx\nwxGixlQpSaSmpjJx4kRMTU0BsLCwwMRERm6Ixikzu5DvfjuNpbkpj97fCVP5XxANWJXO7usXFsrO\nzq4w3YYQjUFqZj6frD5OfpGWCYO8cXW0MXZIQtSoKnVcDxkyhP/7v/8jLy+P1atX8/333zNmzJia\njk2IOkOr0/Pb/kTW7k5Aq9PTp6s7/Xs0N3ZYQtS4KiWJRx99lHXr1pGdnc3OnTuZPHmyjHYSjYJS\nimOxGfy8M46ktFzsbS14KLQD/h1dqjzJpRD1WZVngR05ciQjR46syViEqDZanf6uJtXT6vT8FZfJ\nuj3xJKSUziZ6b3cPmY9JNDpVShIZGRl89913JCYmotVqDds//PDDGgtMiDuVmJrDv5Yd4KmIbvh1\ncKlSGaUUv/wRz7HYDC7nFBrWntYAvXxcGdG3DZ4uMsRVND5VShJPPvkknTt3JigoyDDCSYi66kzS\nFZSCY7EZVU4S6/YksD4qAXMzE5rZWdLc2Ra3ZjYM6ukpyUE0alVKEgUFBbz22ms1HYsQ1eLS5dIp\nMs6lVm3RmX0nU1i7Ox7npla8MsUfe1uZbkaIq6qUJHr06MHff/9Nx44dazoeIe5a2v/mUUpOy63Q\nN/H7gfOsj0qgh7cTfbt6YGZmwpcbY7C2NOXZsd0lQQhxnSoliQkTJjBp0iTc3d2xtLQ0bF+1atUd\nH3jZsmWsWrUKjUZDhw4dePPNN2XCQFEtrk62p9UpLqTn0cqtbPnGvSdSyC0oYc/xFPYcTwFAo4Fn\nRvWghVxWEqKCKiWJF198kZkzZ9K5c+dq6ZNITU3l22+/ZfPmzVhYWPDcc8+xadMmwsPD73rfonHT\nK2VoSUDpJaerSaKwWEtiai5eze0ZG+zF7uMXOR6bQXj/dnRt52SskIWo06qUJCwtLXnkkUeq9cB6\nvZ6CggJMTEwoLCzE1dW1WvcvGqcrucWUaPU42VuRkV1IYmqu4bnYC9nolaJDSwc6tnKkYytHI0Yq\nRP1QpYHk/fr1Y9euXdV2UDc3N6ZNm0ZwcDD9+/fHzs6OPn36VNv+ReN16XI+APd0cMZEoynXeX3m\nfBYA7T0djBKbEPVRlVoSP/30E59//jm2trZYWFiglEKj0bB37947Omh2djbbtm1jx44d2NnZ8cwz\nz7B+/XpGjBhxR/sT4qqr/RGeLk3wcLLhfGoueqUw0Wg4k3QFAG/PpsYMUYh6pUpJ4ueff67Wg0ZF\nRdGyZUscHEp/0YWGhnL48OFbJgkXF7ubPt+YSF2UubYu8or1ALRv04xzl3JJPpRECRrcm9kSdzGb\nVu52tG3VzFih1jg5L8pIXVSPKiWJFi1aVOtBmzdvztGjRykqKsLCwoJ9+/bRrVu3W5ZLS6vauPeG\nzsXFTurif66vi4Tk0ktKFoCbgzUAh0+l4OZoQ1GxjnYe9g227uS8KCN1UeZuk+VNk8SLL77IO++8\nw5gxYyqdzOxOh8B2796dsLAwwsPDMTMzo3PnzowfP/6O9iXEtdKyCjAz1eBoZ0lrt9IhrYmpuVzJ\nLQagvVxqEuK23DRJXLp0CYB//OMf1X7gp59+mqeffrra9ysat7SsQpyaWmNioqGla+kvqHMpOVhb\nlp7qHaTTWojbctMkcXW50oCAgFoJRoi7kV+oJbeghLYe9gDYWJnh6mBNYmoOJiYamtlb4tTUyshR\nClG/yLqLosG4ehOd6//6IgBauduRV6glJ79EWhFC3IGbtiROnz5NUFBQhe13OwRWiJpwdfiri2NZ\nkmjt1oSDMaWXTaU/Qojbd9Mk0aZNGz7//PPaikWI25KTX4xdcdn6JldvpLu2JdH6mnmb2reUloQQ\nt+umScLCwqLah78KUR2KS3S8siQab08HZo0uHT6dVklL4uq8TbZWZjR3tq39QIWo526aJMzNZZlG\nUTfFJGaRk1/C4dNpnEvJobW7nWEdCZdrOqftbS3o29UdFwdrTGRNaiFu2007rn/66afaikOI23Is\nNt3w9+8HzwOlLQlHO0sszMvPVPzI/Z0ZeW/bWo1PiIZCRjeJekcpxbHYDKwtzWjh0oTok6mkXykg\nM7sIl2v6I4QQd0+ShKh3Lmbkk36lkC5tmzGqfzt0esWqyFgU4OIg90EIUZ0kSYh651hsBgA9vJwI\n6dkSWysz9p8qHebqKi0JIaqVJAlR71ztj+jazgkrSzMG+JaNwLt2ZJMQ4u5JkhD1Sn6hljNJV2jr\nYUdT29I10Qf6tcDUpHTkkquDjTHDE6LBkSQh6pWTCZno9Ipu16xJ3czeij5d3bG2NMPDSZKEENWp\nSutJCFFXGPojvJ3LbZ8ytCMPDGxvmO1VCFE95D9K1Bt6pTgel4G9jTmt3csvpGJqYoKNlTSMhahu\n8l8l6o0T8ZlcySumWzsnuXtaiFpitJZETk4OL7/8MmfOnMHExIRFixbRo0cPY4Uj6jCtTs+GqAQ2\nRJ1DAwR1dTd2SEI0GkZLEgsXLmTAgAF89NFHaLVaCgsLjRWKqMMupOexdONJ4i/m4GRvySPDO+PT\n2tHYYQnRaBglSeTm5nLw4EH+/e9/lwZhZkaTJk2MEYqoo/ILtazbE8+2Q0no9IqgLu48FNoBGyvp\nRhOiNhnlPy4pKQlHR0fmzZtHTEwMXbt25eWXX8bKSqZUaOwKirTsO5nK2j/iyM4vwbmpFRMHteee\nDi7GDk2IRkmjlFK1fdC//vqLBx54gBUrVtCtWzcWLlyInZ0dzzzzTG2HIuoApRSH/05j+8Hz7P3r\nIsUlOqwsTBk3qAPhA7wqzOoqhKg9RmlJuLu74+7uTrdupYvFhIWFsWTJkluWS0vLqenQ6gUXF7sG\nUxdZuUV8ufEUf8VnAuDmaE1QV3f6dW+Oo50lV7Lyb1q+IdXF3ZK6KCN1UcbFxe7WL7oJoyQJZ2dn\nPDw8iI+Pp23btuzbtw8vLy9jhCKM6PDpNL7aHENuQQld2zVjVN+2tGtuj0aGtwpRZxitF/CVV17h\nhRdeQKvV0rJlS958801jhSJqWVGxjhXbz7DzyAXMzUx4KLQDA/1aSHIQog4yWpLw8fHh559/Ntbh\nhZEkpGSzeN1JUjPz8XRpwuMjO9PCRUa2CVFXyXhCUSu0Oj2/7U/klz/i0ekVYQEtGd3fC3Mzuelf\niLpMkoSoUVeXGv1x+1lSMvNp2sSCR4d3pkvbZsYOTQhRBZIkRI1JTM1hVWQsf8VnotFAiF8LIvq1\no4m1ubFDE0JUkSQJUe0SU3NYtyeBP0+nAdCljSMPDGqPp/Q9CFHvSJIQ1SY9q4CVkbEciCldb9qr\nuT2j7m1Ll7bNZOSSEPWUJAlx1wqKtGzad47f9p9Hq9PT1sOO8H7t6CrJQYh6T5KEuGNKKfaeSGFl\nZCxXcotxtLNkbLAXgZ3dZL0HIRoISRLijpxLyWH576c5m3wFczMTRvZtw7DA1lhayDxLQjQkkiTE\nbSko0rJmVxzbDiWhgJ4dXHhgoDfODtbGDk0IUQMkSYgqOxabwbe/xZCRXYRbMxsmDelAlzZyv4MQ\nDZkkCXFLl3OK+GnHWaJPpmJqouH+Pm0Y0ac15mZyaUmIhk6ShLghrU7P1oNJrN0TT1GxjrYe9kwb\n5oOnq9zvIERjIUlCVKCU4sjZdH7eGceF9DyaWJszYag3/Xo0l1FLQjQykiREOafOXWb1zlhiL2Sj\n0cAA3+aMGeAlU2kI0UhJkhBA6Qpxy7ec5tD/ptLo2cGF8P7taOFsa+TIhBDGJEmikVNK8cexi/y4\n/SwFRVraezZlwqD2tPWwN3ZoQog6wKhJQq/XM2bMGNzc3Pjss8+MGUqjdC4lhxXbzvD3+SysLEyZ\nHNaRAb7S7yCEKGPUJPHNN9/g5eVFbm6uMcNodDKzC1mzK46ov1JQgK+3M5OGdKCZvZWxQxNC1DFG\nSxIpKSns3LmTmTNn8tVXXxkrjEalqETHb9GJbNp3jmKtHk+XJjwwyFtuiBNC3JDRksSiRYuYO3cu\nOTk5xgqh0VBKcSDmEj/tOEtmdhFNbS14KLQdfbt5YGIil5aEEDdmlCQRGRmJs7MznTp1Ijo6usrl\nXFzsajCq+qUqdaGUYv+JFFZsPc3Z81mYmZowdmB7xg1qj41VwxnSKudFGamLMlIX1UOjlFK1fdD3\n3nuPdevWYWpqSlFREXl5eYSGhvL222/ftFxamrQ6oPTkv1VdHDmTzpo/4jh/KRcN4O/jypgB7XB1\ntKmdIGtJVeqisZC6KCN1UeZuk6VRksS19u/fz5dfflml0U3yoZe62T/A5Zwilv9+mj9Pp6HRQGAn\nN4b3adNg73eQL4MyUhdlpC7K3G2SkPskGgi9Uuw8coFVkWcpKNLRoaUDU8I60ryBJgchRO0wepII\nCAggICDA2GHUa38nXuaHbWdITM3F2tKMqUM7yjxLQohqYfQkIe7cpawCVu44y6G/S6fSCOrixthg\nbxztLI0cmRCioZAkUQ/lF5awKjKWLQcS0eoUXi3smTioA+2ay1QaQojqJUmiHlFKEfVXCqt3xXE5\npwhHO0vGhXgR2MkNjVxaEkLUAEkS9URyeh7f/vY3p89nYWFuyqh72zI0sBWW5rI6nBCi5kiSqOOK\ninWsj0rgt/2J6PSKe9o78/QD96DR6owdmhCiEZAkUUcppYg+mcrKyFgu5xThZG/FQ6Ed8G3vjIuj\njYwBF0LUCkkSddC5lByW/36as8lXMDM14f4+bRjeuzWWFnJpSQhRuyRJ1CG5BSWs3hXHzsPJKKBn\nRxceCPHG2cHa2KEJIRopSRJ1gFanZ/exi6zeFUduQQkeTjY8FNqBzjKFtxDCyCRJGJFWpyfqrxQ2\nRCWQfqUQSwtTxod4M9jfEzNTE2OHJ4QQkiSM5fCZNH7Yeob0K4WYmZowuKcn9wW1xqGJ3C0thKg7\nJEnUsit5xXz/+2kOxFzC1ETDoJ6e3Ne7tUylIYSokyRJ1BK9XvHHsQusiowlr1CLVwt7Hh7WqcFO\n4S2EaBgkSdSC0+ez+H7raRJTc7G0MOWh0A6E+LWQWVqFEHWeJIkalJZVwM87Y9l/6hIAQV3cGRvs\nJZeWhBD1hiSJGpBXWMLGqHNsPXQerU7R1sOeBwe3x6tFU2OHJoQQt8UoSSIlJYW5c+eSnp6Oqakp\n48aNY8qUKcYIpVoppfjj2EVW7jhLXqEWJ3tLxgzwIqCzm1xaEkLUS0ZJEqampsybN49OnTqRl5fH\n6NGj6du3L15eXsYIp1qkXs7n680xxCRmYWVhyrhgLwb7e2JuJlNpCCHqL6MkCRcXF1xcXACwtbXF\ny8uLS5cu1cskUVCk5fcD59m47xwlWj2+3s5MGtKBZvZWxg5NCCHumtH7JJKSkoiJiaF79+7GDuW2\nlGh17Dh8gY17E8jJL8He1oJH7++Af0cXWQBICNFgaJRSylgHz8vLY/LkyTz55JMMHjzYWGHclqIS\nHb9Hn+PnHWdJzyrA2tKMiGBvRvVvh42VubHDE0KIamW0JKHVann88cfp378/U6dOrVIZY66hUFyi\nY/ufyfy6P5HsvGIszEwI8WvBfb1bY2djUauxuLjYyXoS/yN1UUbqoozURRkXF7u7Km+0y03z58/H\n29u7ygnCmOIvZrNkw0kuZuRjZWHK8KDWhPq3xN62dpODEELUNqMkiUOHDrF+/Xo6dOhAeHg4Go2G\n559/nv79+xsjnBvS6vSs25PApr3n0CvF4J6ejOrXFlu5rCSEaCSMkiR69uzJqVOnjHHoKjubdIVv\nfoshKS0PJ3srpg/vRKfWjsYOSwghapXRRzfVNbkFJayKPMuuoxcB6N+jOQ8M9MbaUqpKCNH4yDff\nNf48ncayzTHkFpTg6WLL5LCOtPd0MHZYQghhNJIkKL0h7oetZ9h9/CLmZiaMC/Ei1L+lrA4nhGj0\nGn2SOJmQybLNMaRfKaSVWxMeG9FF1ngQQoj/abRJIuNKIT9uP8PBv9PQAMODWjPq3rbSehBCiGs0\nuiRRotXz6/5ENkYlUKzV49XCnkmhHWntfnc3nAghREPUqJLEsdh0vt96hkuXC7C3tWBymBdBXd1l\nGm8hhLiBRpEk0rIKWLHtDIfPpGOi0RDq35JR97bFxqpRvH0hhLhjDfpbsqhEx+Z959i0LxGtTk+H\nlg5MCu2Ap2sTY4cmhBD1QoNMEkopDv2dxo/bz5KRXYhDEwvGD/QmsJObTOMthBC3ocEliYSUbFZs\nPcPppCuYmmgYFtiK+/u0kTumhRDiDjSYb84recX8HBnLnuMXUcA97Z0ZH+KNWzMbY4cmhBD1Vr1P\nElqdnu2Hkli7J56CIh2eLrZMGNSezm2aGTs0IYSo9+p1kvg78TLfbjnNhfQ8bK3MmDSkAwN8m2Nq\nIjfECSFEdaiXSSK3oISfdpxl97GLaIBg3+ZE9G9X6yvECSFEQ2e0JLFr1y4WLVqEUooxY8YwY8aM\nW5ZRShF9MpXvt54ht6CElq5NmDrUh3bN7WshYiGEaHyMkiT0ej2vv/46y5Ytw9XVlbFjxzJo0CC8\nvLxuWCbjSgEfrTrG0dgMLMxNGB/iTWgvT7m0JIQQNcgoSeLYsWO0bt2aFi1aADB8+HC2bdt20yTx\n1NvbySvU0qm1I1OH+eDqYF1b4QohRKNllCSRmpqKh4eH4bGbmxvHjx+/aRm9gqlDO9K/R3O5IU4I\nIWqJUZKEUuq2yyx9JZTCvKIaiEYIIcSNGCVJuLu7c+HCBcPj1NRUXF1db1rGzsZCRi9dw8VFpja/\nSuqijNRFGamL6mGUXt9u3bqRmJhIcnIyxcXFbNy4kUGDBhkjFCGEEDdhlJaEqakpr776KtOnT0cp\nxdixY2/aaS2EEMI4NOpOOgiEEEI0CnKTgRBCiBuSJCGEEOKGJEkIIYS4oTqfJHbt2sXQoUMJCwvj\n888/N3Y4tSolJYUpU6Zw3333MWLECL755hsArly5wvTp0wkLC+ORRx4hJyfHyJHWHr1eT0REBDNn\nzgQgKSmJ8ePHExYWxuzZs9FqtUaOsHbk5OTwzDPPMGzYMIYPH87Ro0cb7XmxbNky7r//fkaMGMGc\nOXMoLi5uNOfF/Pnz6dOnDyNGjDBsu9l58MYbbzBkyBBGjRrFqVOnqnSMOp0krs7xtHTpUjZs2MDG\njRuJjY01dli1xtTUlHnz5rFp0yZWrFjB8uXLiY2N5fPPPycoKIjffvuNwMBAFi9ebOxQa80333xT\nbiTcf/7zH6ZNm8Zvv/2GnZ0dq1atMmJ0tWfhwoUMGDCAzZs3s3btWtq1a9coz4vU1FS+/fZbVq9e\nzfr169HpdGzcuLHRnBejR49m6dKl5bbd6DzYuXMniYmJbNmyhQULFvDaa69V6Rh1OklcO8eTubm5\nYY6nxsLFxYVOnToBYGtri5eXF6mpqWzbto2IiAgAIiIi2Lp1qzHDrDUpKSns3LmTcePGGbbt27eP\nsLAwoLQufv/9d2OFV2tyc3M5ePAgY8aMAcDMzAw7O7tGe17o9XoKCgrQarUUFhbi6upKdHR0ozgv\n/P39sbcvPwv29efB1e/Mbdu2ER4eDkCPHj3IyckhPT39lseo00misjmeLl26ZMSIjCcpKYmYmBh6\n9OhBRkYGzs7OQGkiuXz5spGjqx2LFi1i7ty5hrm7Ll++TNOmTTH530zA7u7ujeL8SEpKwtHRkXnz\n5hEREcGrr75KQUFBozwv3NzcmDZtGsHBwfTv3x87Ozs6d+6Mvb19ozsvrsrMzCx3HmRmZgJw6dIl\n3N3dDa9zc3MjNTX1lvur00lCbuEolZeXxzPPPMP8+fOxtbVtlBMcRkZG4uzsTKdOnQznhVKqwjnS\nGOpGq9Vy8uRJHnzwQdasWYO1tTWff/55o3jv18vOzmbbtm3s2LGDP/74g4KCAnbt2lXhdY2xbq5X\n2fdpVeqlTq9MdydzPDU0Wq2WZ555hlGjRjF48GAAnJycSE9Px9nZmbS0NJo1a/jref/5559s376d\nnTt3UlRURF5eHosWLSInJwe9Xo+JiQkpKSmN4vxwd3fH3d2dbt26ATBkyBC++OKLRnleREVF0bJl\nSxwcHAAYPHgwhw8fJjs7u9GdF1fd6Dxwc3MjJSXF8Lqq1kudbknIHE+loxe8vb2ZOnWqYdvAgQNZ\nvXo1AGvWrGkUdTJ79mwiIyPZtm0b7733HoGBgfznP/8hMDCQX3/9FWg8deHs7IyHhwfx8fFAab+M\nt7d3ozwvmjdvztGjRykqKkIpxb59+2jfvn2jOi+ubyHc6DwYNGgQv/zyCwBHjhzB3t7ecFnqZur8\ntBy7du1i4cKFhjmeqrLMaUNx6NAhJk2aRIcOHdBoNGg0Gp5//nm6d+/Oc889x8WLF2nevDkffvhh\nhc6rhmz//v18+eWXfPbZZ5w/f57Zs2eTnZ1Np06deOeddzA3Nzd2iDUuJiaGl19+Ga1WS8uWLXnz\nzTfR6XSN8rz45JNP2LhxI2ZmZnTu3Jk33niDlJSURnFezJkzh+joaLKysnB2dmbWrFkMHjyYZ599\nttLzYMGCBfzxxx9YW1vz5ptv0qVLl1seo84nCSGEEMZTpy83CSGEMC5JEkIIIW5IkoQQQogbkiQh\nhBDihiRJCCGEuCFJEkIIIW5IkoSocwYOHMjZs2dr5ViffPJJuWmk582bx/Lly+96v/PmzWPEiBHM\nnj37rvd1MzExMWzevLlGjyEaN0kSolH75JNPKCkpqdZ9pqens2XLFtavX897771Xrfu+3smTJ+84\nSej1+mqORjREkiREvREfH89jjz3GuHHjCA8PN0w9AODj48PixYsZO3YsoaGhbNmyxfDcb7/9xrBh\nwxg9ejSLFy/Gx8eHgoICFixYgEajYcKECURERJCbmwvA6dOnmTp1KmFhYbz00ks3jOeXX35hxIgR\njBo1ilmzZpGZmUleXh5Tp06lqKiIiIgIvv7663Jl1q5dy9NPP214rNPp6Nevn2GOsiVLljB+/HhG\njx7NE088QUZGBgAlJSW89dZbjBgxgvDwcGbNmkVWVhYff/wx+/btIyIigoULFwKlsxREREQwatQo\npk2bxvnz54HSO9XDw8N54403mDBhAn/88cfdfByisVBC1DEhISHqzJkz5bZptVoVERGh4uLilFJK\n5ebmqrCwMMPjjh07quXLlyullDp06JDq16+fUkqp9PR0FRAQoBITE5VSSn311VfKx8dH5efnG8oV\nFGMkQHAAAAOxSURBVBQYjvPSSy+pBx98UBUXF6vi4mI1fPhwFRUVVSHG06dPq3vvvVelp6crpZT6\n4IMP1HPPPaeUUiopKUn17t270vdWUFCgevfurS5fvqyUUmr79u1q6tSpSiml1q5dq1599VXDa7//\n/ns1Z84cpZRSH3/8sZo1a5bSarVKKWUov3r1avXMM88YymRkZKjevXur2NhYpZRSK1euVOPGjVNK\nKRUdHa06d+6sjh49WmlsQlRGWhKiXkhISCAuLo7Zs2cTHh7OQw89RElJSbmVCu+77z4AfH19SUtL\no7i4mKNHj9K1a1datmwJwNixYyvsW103M83gwYMxNzfH3Nyczp07k5iYWKFMdHQ0wcHBODk5ATBh\nwgSioqJu+T6srKwYNGgQGzZsAEonYLu6eND27dvZu3cv4eHhhIeH8/3333Px4kWgdKr0KVOmYGpq\nCmCY9fR6R48epVOnTrRr1w6AMWPGcOrUKfLz8wFo3bo13bt3v2WcQlxVp6cKF+IqpRTNmjVjzZo1\nlT6v0WiwtLQEMCw2o9PpKiSA6x9XxsLCwvC3qalppesjK6UqzMV/9bi3Eh4ezptvvsn999/P/v37\neeeddwz7fOKJJxg9enSlx6uKyuK69rGNjU2V9iPEVdKSEPVC27ZtsbKyYu3atYZtcXFx5OXlARW/\nRK8+9vX15cSJE4br8tf2YwA0adKk3ELxVRUUFMTOnTsNfQY//vgjffr0qXD8yvj7+5Obm8t7771H\naGioIbkNHDiQ77//nuzsbACKi4uJiYkBICQkhG+++cbQyX511bkmTZoY+lKuvt9Tp04ZphFfvXo1\nnTt3luQg7pi0JESdo9FoePjhhzEzMzP8Ml6/fj2fffYZCxcu5Msvv0Sn0+Hs7MwHH3xgKHP9PqB0\nAZZ//etfzJgxA0dHR4KDgzEzM8Pa2hqAadOmMWXKFKytrfn222+rHKO3tzezZ8/m4YcfxsTEhJYt\nW7JgwYIKx7+R8PBwPvroo/9v5w5xGASiIAwPBoMhHADNBRCcgtUEzQWQSByChAOgSHB4joVBLqlo\ngnumadK0/T/51LrZyeat1nW9Z2VZ6jgO1XWtIAh0XZeqqlKWZWqaRuM4yjmnMAyVpqmmaVJRFJrn\nWc455Xmurus0DIPatpX3XkmS3E0FeAVfhePnneepKIokPW/W27a9ZRcC+Ac0Cfy8ZVm077u894rj\nWH3ff/pIwNegSQAATDxcAwBMhAQAwERIAABMhAQAwERIAABMhAQAwPQAVSnSA55bZkwAAAAASUVO\nRK5CYII=\n",
+            "text/plain": [
+              "\u003cmatplotlib.figure.Figure at 0x7f47b8e3bd90\u003e"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "output_type": "display_data"
+        }
+      ],
+      "source": [
+        "plt.plot(eager_means)\n",
+        "plt.ylabel('Time(s)')\n",
+        "plt.xlabel('Length of vector')\n",
+        "_ = plt.title('Time to sum the elements of 1000 vectors (Eager)')\n",
+        "_ = plt.ylim(ymin=0)"
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "default_view": {},
+      "name": "Autograph vs. Eager vs Graph sum",
+      "provenance": [
+        {
+          "file_id": "1olZkm32B7n7pQwlIAXR0_w8fZhRHCtkX",
+          "timestamp": 1531755808890
+        }
+      ],
+      "version": "0.3.2",
+      "views": {}
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
diff --git a/tensorflow/contrib/autograph/impl/api.py b/tensorflow/contrib/autograph/impl/api.py
index f7fe3de5dabdebe255210a7b6247809dbd70d10b..ee71f4f9acf8b7f00f849dcdbcdc020fed04c278 100644
--- a/tensorflow/contrib/autograph/impl/api.py
+++ b/tensorflow/contrib/autograph/impl/api.py
@@ -23,7 +23,6 @@ from functools import wraps
 from enum import Enum
 
 # pylint:disable=g-bad-import-order
-import gast
 import six
 # pylint:enable=g-bad-import-order
 
@@ -245,19 +244,21 @@ def to_graph(e,
   _, name, namespace = conversion.entity_to_graph(e, program_ctx, arg_values,
                                                   arg_types)
 
-  module = gast.Module([])
+  nodes = []
   for dep in reversed(program_ctx.dependency_cache.values()):
-    module.body.append(dep)
-  compiled_node, compiled_src = compiler.ast_to_object(
-      module, source_prefix=program_ctx.required_imports)
+    nodes.extend(dep)
+  compiled_module, compiled_src = compiler.ast_to_object(
+      nodes,
+      source_prefix=program_ctx.required_imports,
+      include_source_map=True)
 
   # The compiled code should see everything the entry entity saw.
   # TODO(mdan): This might not work well if the call tree spans modules?
   for key, val in namespace.items():
     # Avoid overwriting entities that have been transformed.
-    if key not in compiled_node.__dict__:
-      compiled_node.__dict__[key] = val
-  compiled_fn = getattr(compiled_node, name)
+    if key not in compiled_module.__dict__:
+      compiled_module.__dict__[key] = val
+  compiled_fn = getattr(compiled_module, name)
 
   # Need this so the source_mapping attribute is available for the context
   # manager to access for runtime errors.
@@ -270,7 +271,7 @@ def to_graph(e,
                      '"%s", which is reserved for AutoGraph.' %
                      (compiled_fn, source_map_attribute_name))
   setattr(compiled_fn, source_map_attribute_name,
-          compiled_node.__dict__['ag_source_map__'])
+          compiled_module.__dict__['ag_source_map__'])
 
   if verbose:
     logging.info('Compiled output of %s:\n\n%s\n', e, compiled_src)
@@ -308,7 +309,7 @@ def to_code(e,
   conversion.entity_to_graph(e, program_ctx, arg_values, arg_types)
 
   code = '\n'.join(
-      compiler.ast_to_source(dep, indentation)[0]
+      compiler.ast_to_source(dep, indentation)
       for dep in reversed(tuple(six.itervalues(program_ctx.dependency_cache))))
 
   return program_ctx.required_imports + '\n\n' + code
diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index 7bd0ba3f2dd2313552f0e184c3301a426f90dce2..57ec739a8006414377de451723538d27e286b2bd 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -164,7 +164,7 @@ def class_to_graph(c, program_ctx):
       class_namespace = namespace
     else:
       class_namespace.update(namespace)
-    converted_members[m] = node
+    converted_members[m] = node[0]
   namer = program_ctx.new_namer(class_namespace)
   class_name = namer.compiled_class_name(c.__name__, c)
 
@@ -175,10 +175,10 @@ def class_to_graph(c, program_ctx):
   # program_ctx.update_name_map(namer)).
   output_nodes = []
   renames = {}
-  bases = []
+  base_names = []
   for base in c.__bases__:
     if isinstance(object, base):
-      bases.append('object')
+      base_names.append('object')
       continue
     if is_whitelisted_for_graph(base):
       alias = namer.new_symbol(base.__name__, ())
@@ -190,28 +190,28 @@ def class_to_graph(c, program_ctx):
     else:
       # This will trigger a conversion into a class with this name.
       alias = namer.compiled_class_name(base.__name__, base)
-    bases.append(alias)
+    base_names.append(alias)
     renames[qual_names.QN(base.__name__)] = qual_names.QN(alias)
   program_ctx.update_name_map(namer)
 
   # Generate the definition of the converted class.
-  output_nodes.append(
-      gast.ClassDef(
-          class_name,
-          bases=bases,
-          keywords=[],
-          body=list(converted_members.values()),
-          decorator_list=[]))
-  node = gast.Module(output_nodes)
-
+  bases = [gast.Name(n, gast.Load(), None) for n in base_names]
+  class_def = gast.ClassDef(
+      class_name,
+      bases=bases,
+      keywords=[],
+      body=list(converted_members.values()),
+      decorator_list=[])
   # Make a final pass to replace references to the class or its base classes.
   # Most commonly, this occurs when making super().__init__() calls.
   # TODO(mdan): Making direct references to superclass' superclass will fail.
-  node = qual_names.resolve(node)
+  class_def = qual_names.resolve(class_def)
   renames[qual_names.QN(c.__name__)] = qual_names.QN(class_name)
-  node = ast_util.rename_symbols(node, renames)
+  class_def = ast_util.rename_symbols(class_def, renames)
+
+  output_nodes.append(class_def)
 
-  return node, class_name, class_namespace
+  return output_nodes, class_name, class_namespace
 
 
 def _add_reserved_symbol(namespace, name, entity):
@@ -279,7 +279,7 @@ def function_to_graph(f,
   program_ctx.update_name_map(namer)
   # TODO(mdan): Use this at compilation.
 
-  return node, new_name, namespace
+  return (node,), new_name, namespace
 
 
 def node_to_graph(node, context, rewrite_errors=True):
diff --git a/tensorflow/contrib/autograph/impl/conversion_test.py b/tensorflow/contrib/autograph/impl/conversion_test.py
index 207225a1acec7db64cc121d7451e4bf34dabc7e7..bfc51365a3031176ed5151e5478b368a9d26aac6 100644
--- a/tensorflow/contrib/autograph/impl/conversion_test.py
+++ b/tensorflow/contrib/autograph/impl/conversion_test.py
@@ -60,10 +60,11 @@ class ConversionTest(test.TestCase):
       return a + b
 
     program_ctx = self._simple_program_ctx()
-    ast, name, ns = conversion.entity_to_graph(f, program_ctx, None, None)
-    self.assertTrue(isinstance(ast, gast.FunctionDef), ast)
+    nodes, name, ns = conversion.entity_to_graph(f, program_ctx, None, None)
+    fn_node, = nodes
+    self.assertIsInstance(fn_node, gast.FunctionDef)
     self.assertEqual('tf__f', name)
-    self.assertTrue(ns['b'] is b)
+    self.assertIs(ns['b'], b)
 
   def test_entity_to_graph_call_tree(self):
 
@@ -78,14 +79,11 @@ class ConversionTest(test.TestCase):
 
     self.assertTrue(f in program_ctx.dependency_cache)
     self.assertTrue(g in program_ctx.dependency_cache)
-    self.assertEqual('tf__f', program_ctx.dependency_cache[f].name)
-    # need one extra .body[0] in order to step past the try/except wrapper that
-    # is added automatically, the other for the with tf.name_scope('f') that is
-    # added automatically
-    self.assertEqual(
-        'tf__g',
-        program_ctx.dependency_cache[f].body[0].body[0].body[0].value.func.id)
-    self.assertEqual('tf__g', program_ctx.dependency_cache[g].name)
+    f_node = program_ctx.dependency_cache[f][0]
+    g_node = program_ctx.dependency_cache[g][0]
+    self.assertEqual('tf__f', f_node.name)
+    self.assertEqual('tf__g', f_node.body[0].body[0].body[0].value.func.id)
+    self.assertEqual('tf__g', g_node.name)
 
   def test_entity_to_graph_class_hierarchy(self):
 
@@ -118,9 +116,9 @@ class ConversionTest(test.TestCase):
     self.assertTrue(TestBase in program_ctx.dependency_cache)
     self.assertTrue(TestSubclass in program_ctx.dependency_cache)
     self.assertEqual('TfTestBase',
-                     program_ctx.dependency_cache[TestBase].body[-1].name)
+                     program_ctx.dependency_cache[TestBase][-1].name)
     self.assertEqual('TfTestSubclass',
-                     program_ctx.dependency_cache[TestSubclass].body[-1].name)
+                     program_ctx.dependency_cache[TestSubclass][-1].name)
 
   def test_entity_to_graph_class_hierarchy_whitelisted(self):
 
@@ -139,10 +137,9 @@ class ConversionTest(test.TestCase):
     self.assertTrue(TestSubclass in program_ctx.dependency_cache)
     self.assertFalse(training.Model in program_ctx.dependency_cache)
     self.assertEqual(
-        'Model',
-        program_ctx.dependency_cache[TestSubclass].body[0].names[0].name)
+        'Model', program_ctx.dependency_cache[TestSubclass][0].names[0].name)
     self.assertEqual('TfTestSubclass',
-                     program_ctx.dependency_cache[TestSubclass].body[-1].name)
+                     program_ctx.dependency_cache[TestSubclass][-1].name)
 
   def test_entity_to_graph_lambda(self):
     f = lambda a: a
diff --git a/tensorflow/contrib/autograph/lang/special_functions.py b/tensorflow/contrib/autograph/lang/special_functions.py
index 11135295a7966bc5d693676fcc71fe43791f2e99..6149cbbd6c9214fb6989bdcae430286445b1db28 100644
--- a/tensorflow/contrib/autograph/lang/special_functions.py
+++ b/tensorflow/contrib/autograph/lang/special_functions.py
@@ -26,6 +26,43 @@ from __future__ import print_function
 from tensorflow.contrib.autograph.operators import data_structures
 
 
+def tensor_list(elements,
+                element_dtype=None,
+                element_shape=None,
+                use_tensor_array=False):
+  """Creates an tensor list and populates it with the given elements.
+
+  This function provides a more uniform access to tensor lists and tensor
+  arrays, and allows optional initialization.
+
+  Note: this function is a simplified wrapper. If you need greater control,
+  it is recommended to use the underlying implementation directly.
+
+  Args:
+    elements: Iterable[tf.Tensor, ...], the elements to initially fill the list
+        with
+    element_dtype: Optional[tf.DType], data type for the elements in the list;
+        required if the list is empty
+    element_shape: Optional[tf.TensorShape], shape for the elements in the list;
+        required if the list is empty
+    use_tensor_array: bool, whether to use the more compatible but restrictive
+        tf.TensorArray implementation
+  Returns:
+    Union[tf.Tensor, tf.TensorArray], the new list.
+  Raises:
+    ValueError: for invalid arguments
+  """
+  if not (elements or (element_dtype and element_shape)):
+    raise ValueError(
+        'element_dtype and element_shape are required for empty lists')
+  if use_tensor_array:
+    return data_structures.tf_tensor_array_new(elements, element_dtype,
+                                               element_shape)
+  else:
+    return data_structures.tf_tensor_list_new(elements, element_dtype,
+                                              element_shape)
+
+
 def stack(list_or_tensor, element_dtype=None, strict=True):
   """Stacks the input, if it admits the notion of stacking.
 
diff --git a/tensorflow/contrib/autograph/lang/special_functions_test.py b/tensorflow/contrib/autograph/lang/special_functions_test.py
index a49cb6407517b634e0f1259fccda03d4ed18e83f..db492cc5c689155bf7d426cbfee320130f4bda9f 100644
--- a/tensorflow/contrib/autograph/lang/special_functions_test.py
+++ b/tensorflow/contrib/autograph/lang/special_functions_test.py
@@ -28,7 +28,23 @@ from tensorflow.python.platform import test
 
 class SpecialFunctionsTest(test.TestCase):
 
-  def test_basic(self):
+  def test_tensor_list_from_elements(self):
+    elements = [constant_op.constant([1, 2]), constant_op.constant([3, 4])]
+
+    l = special_functions.tensor_list(elements)
+    sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(sl), [[1, 2], [3, 4]])
+
+  def test_tensor_list_array_from_elements(self):
+    elements = [constant_op.constant([1, 2]), constant_op.constant([3, 4])]
+
+    l = special_functions.tensor_list(elements, use_tensor_array=True)
+    sl = l.stack()
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(sl), [[1, 2], [3, 4]])
+
+  def test_stack(self):
     self.assertEqual(special_functions.stack(1, strict=False), 1)
     self.assertListEqual(
         special_functions.stack([1, 2, 3], strict=False), [1, 2, 3])
diff --git a/tensorflow/contrib/autograph/operators/data_structures.py b/tensorflow/contrib/autograph/operators/data_structures.py
index 06d8727b0fcc30b532b3f11281cd1a83c51ac8bc..cc0a3c35448980945f2975f829f9d9421afdb65d 100644
--- a/tensorflow/contrib/autograph/operators/data_structures.py
+++ b/tensorflow/contrib/autograph/operators/data_structures.py
@@ -28,7 +28,6 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import tensor_array_ops
-from tensorflow.python.ops import variables
 
 
 # TODO(mdan): Once control flow supports objects, repackage as a class.
@@ -48,29 +47,101 @@ def new_list(iterable=None):
   else:
     elements = ()
 
-  # TODO(mdan): Extend these criteria.
-  if any(isinstance(el, variables.Variable) for el in elements):
+  if elements:
+    # When the list contains elements, it is assumed to be a "Python" lvalue
+    # list.
     return _py_list_new(elements)
-  return _tf_tensor_list_new(elements)
+  return tf_tensor_list_new(elements)
 
 
-def _tf_tensor_list_new(elements):
+def tf_tensor_array_new(elements, element_dtype=None, element_shape=None):
   """Overload of new_list that stages a Tensor list creation."""
   elements = tuple(ops.convert_to_tensor(el) for el in elements)
+
+  all_dtypes = set(el.dtype for el in elements)
+  if len(all_dtypes) == 1:
+    inferred_dtype, = tuple(all_dtypes)
+    if element_dtype is not None and element_dtype != inferred_dtype:
+      raise ValueError(
+          'incompatible dtype; specified: {}, inferred from {}: {}'.format(
+              element_dtype, elements, inferred_dtype))
+  elif len(all_dtypes) > 1:
+    raise ValueError(
+        'TensorArray requires all elements to have the same dtype:'
+        ' {}'.format(elements))
+  else:
+    if element_dtype is None:
+      raise ValueError('dtype is required to create an empty TensorArray')
+
+  all_shapes = set(tuple(el.shape.as_list()) for el in elements)
+  if len(all_shapes) == 1:
+    inferred_shape, = tuple(all_shapes)
+    if element_shape is not None and element_shape != inferred_shape:
+      raise ValueError(
+          'incompatible shape; specified: {}, inferred from {}: {}'.format(
+              element_shape, elements, inferred_shape))
+  elif len(all_shapes) > 1:
+    raise ValueError(
+        'TensorArray requires all elements to have the same shape:'
+        ' {}'.format(elements))
+    # TODO(mdan): We may want to allow different shapes with infer_shape=False.
+  else:
+    inferred_shape = None
+
+  if element_dtype is None:
+    element_dtype = inferred_dtype
+  if element_shape is None:
+    element_shape = inferred_shape
+
+  l = tensor_array_ops.TensorArray(
+      dtype=element_dtype,
+      size=len(elements),
+      dynamic_size=True,
+      infer_shape=(element_shape is None),
+      element_shape=element_shape)
+  for i, el in enumerate(elements):
+    l = l.write(i, el)
+  return l
+
+
+def tf_tensor_list_new(elements, element_dtype=None, element_shape=None):
+  """Overload of new_list that stages a Tensor list creation."""
+  elements = tuple(ops.convert_to_tensor(el) for el in elements)
+
   all_dtypes = set(el.dtype for el in elements)
   if len(all_dtypes) == 1:
-    element_dtype = tuple(all_dtypes)[0]
+    inferred_dtype = tuple(all_dtypes)[0]
+    if element_dtype is not None and element_dtype != inferred_dtype:
+      raise ValueError(
+          'incompatible dtype; specified: {}, inferred from {}: {}'.format(
+              element_dtype, elements, inferred_dtype))
   else:
     # Heterogeneous lists are ok.
-    element_dtype = dtypes.variant
+    if element_dtype is not None:
+      raise ValueError(
+          'specified dtype {} is inconsistent with that of elements {}'.format(
+              element_dtype, elements))
+    inferred_dtype = dtypes.variant
 
-  # TODO(mdan): This may fail for elements of variable shapes.
   all_shapes = set(tuple(el.shape.as_list()) for el in elements)
   if len(all_shapes) == 1:
-    element_shape = array_ops.shape(elements[0])
+    inferred_shape = array_ops.shape(elements[0])
+    if element_shape is not None and element_shape != inferred_shape:
+      raise ValueError(
+          'incompatible shape; specified: {}, inferred from {}: {}'.format(
+              element_shape, elements, inferred_shape))
   else:
     # Heterogeneous lists are ok.
-    element_shape = constant_op.constant(-1)  # unknown shape, by convention
+    if element_shape is not None:
+      raise ValueError(
+          'specified shape {} is inconsistent with that of elements {}'.format(
+              element_shape, elements))
+    inferred_shape = constant_op.constant(-1)  # unknown shape, by convention
+
+  if element_dtype is None:
+    element_dtype = inferred_dtype
+  if element_shape is None:
+    element_shape = inferred_shape
 
   l = list_ops.empty_tensor_list(
       element_shape=element_shape, element_dtype=element_dtype)
diff --git a/tensorflow/contrib/autograph/operators/data_structures_test.py b/tensorflow/contrib/autograph/operators/data_structures_test.py
index 8bbb52d6c10b241ec754c7dea599fa15a869595f..7ea11a839b6070f6c6dfdd8a8f7939923a7d9eaa 100644
--- a/tensorflow/contrib/autograph/operators/data_structures_test.py
+++ b/tensorflow/contrib/autograph/operators/data_structures_test.py
@@ -37,10 +37,51 @@ class ListTest(test.TestCase):
 
   def test_new_list_tensor(self):
     l = data_structures.new_list([3, 4, 5])
+    self.assertAllEqual(l, [3, 4, 5])
+
+  def test_tf_tensor_list_new(self):
+    l = data_structures.tf_tensor_list_new([3, 4, 5])
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
     with self.test_session() as sess:
       self.assertAllEqual(sess.run(t), [3, 4, 5])
 
+  def test_tf_tensor_list_new_illegal_input(self):
+    with self.assertRaises(ValueError):
+      data_structures.tf_tensor_list_new([3, 4.0])
+    # TODO(mdan): It might make more sense to type cast in this case.
+    with self.assertRaises(ValueError):
+      data_structures.tf_tensor_list_new([3, 4], element_dtype=dtypes.float32)
+    # Tensor lists do support heterogeneous lists.
+    self.assertIsNot(data_structures.tf_tensor_list_new([3, [4, 5]]), None)
+    with self.assertRaises(ValueError):
+      data_structures.tf_tensor_list_new([3, 4], element_shape=(2,))
+    with self.assertRaises(ValueError):
+      data_structures.tf_tensor_list_new([], element_shape=(2,))
+    with self.assertRaises(ValueError):
+      data_structures.tf_tensor_list_new([], element_dtype=dtypes.float32)
+
+  def test_tf_tensor_array_new(self):
+    l = data_structures.tf_tensor_array_new([3, 4, 5])
+    t = l.stack()
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(t), [3, 4, 5])
+
+  def test_tf_tensor_array_new_illegal_input(self):
+    with self.assertRaises(ValueError):
+      data_structures.tf_tensor_array_new([3, 4.0])
+    with self.assertRaises(ValueError):
+      data_structures.tf_tensor_array_new([3, 4], element_dtype=dtypes.float32)
+    with self.assertRaises(ValueError):
+      data_structures.tf_tensor_array_new([3, [4, 5]])
+    with self.assertRaises(ValueError):
+      data_structures.tf_tensor_array_new([3, 4], element_shape=(2,))
+    with self.assertRaises(ValueError):
+      data_structures.tf_tensor_array_new([], element_shape=(2,))
+    # TAs can infer the shape.
+    self.assertIsNot(
+        data_structures.tf_tensor_array_new([], element_dtype=dtypes.float32),
+        None)
+
   def test_append_tensor_list(self):
     l = data_structures.new_list()
     x = constant_op.constant([1, 2, 3])
diff --git a/tensorflow/contrib/autograph/pyct/BUILD b/tensorflow/contrib/autograph/pyct/BUILD
index f77a6ab3928e4f933f4d21abba2030d4d6f8ec0a..ddadc6b96e8eb5417bfa1676ae304f7cbdedd92b 100644
--- a/tensorflow/contrib/autograph/pyct/BUILD
+++ b/tensorflow/contrib/autograph/pyct/BUILD
@@ -99,6 +99,16 @@ py_test(
     ],
 )
 
+py_test(
+    name = "origin_info_test",
+    srcs = ["origin_info_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_test(
     name = "parser_test",
     srcs = ["parser_test.py"],
diff --git a/tensorflow/contrib/autograph/pyct/ast_util.py b/tensorflow/contrib/autograph/pyct/ast_util.py
index 86e3f56a64d5300d925bc7fa31eaf69cd5e487a5..d7453b078197cd6f1c0521b861e96dd28d287cab 100644
--- a/tensorflow/contrib/autograph/pyct/ast_util.py
+++ b/tensorflow/contrib/autograph/pyct/ast_util.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import ast
 
-import collections
 import gast
 
 from tensorflow.contrib.autograph.pyct import anno
@@ -185,6 +184,7 @@ class PatternMatcher(gast.NodeVisitor):
         if v != p:
           return self.no_match()
 
+
 def matches(node, pattern):
   """Basic pattern matcher for AST.
 
@@ -253,30 +253,61 @@ def apply_to_single_assignments(targets, values, apply_fn):
       apply_fn(target, values)
 
 
-def iter_fields(node):
-  for field in sorted(node._fields):
-    try:
-      yield getattr(node, field)
-    except AttributeError:
-      pass
-
-
-def iter_child_nodes(node):
-  for field in iter_fields(node):
-    if isinstance(field, gast.AST):
-      yield field
-    elif isinstance(field, list):
-      for item in field:
-        if isinstance(item, gast.AST):
-          yield item
-
-
-def parallel_walk(node_a, node_b):
-  todo_a = collections.deque([node_a])
-  todo_b = collections.deque([node_b])
-  while todo_a and todo_b:
-    node_a = todo_a.popleft()
-    node_b = todo_b.popleft()
-    todo_a.extend(iter_child_nodes(node_a))
-    todo_b.extend(iter_child_nodes(node_b))
-    yield node_a, node_b
+def parallel_walk(node, other):
+  """Walks two ASTs in parallel.
+
+  The two trees must have identical structure.
+
+  Args:
+    node: Union[ast.AST, Iterable[ast.AST]]
+    other: Union[ast.AST, Iterable[ast.AST]]
+  Yields:
+    Tuple[ast.AST, ast.AST]
+  Raises:
+    ValueError: if the two trees don't have identical structure.
+  """
+  if isinstance(node, (list, tuple)):
+    node_stack = list(node)
+  else:
+    node_stack = [node]
+
+  if isinstance(other, (list, tuple)):
+    other_stack = list(other)
+  else:
+    other_stack = [other]
+
+  while node_stack and other_stack:
+    assert len(node_stack) == len(other_stack)
+    n = node_stack.pop()
+    o = other_stack.pop()
+
+    if (not isinstance(n, (ast.AST, gast.AST)) or
+        not isinstance(o, (ast.AST, gast.AST)) or
+        n.__class__.__name__ != o.__class__.__name__):
+      raise ValueError('inconsistent nodes: {} and {}'.format(n, o))
+
+    yield n, o
+
+    for f in n._fields:
+      n_child = getattr(n, f, None)
+      o_child = getattr(o, f, None)
+      if f.startswith('__') or n_child is None or o_child is None:
+        continue
+
+      if isinstance(n_child, (list, tuple)):
+        if (not isinstance(o_child, (list, tuple)) or
+            len(n_child) != len(o_child)):
+          raise ValueError(
+              'inconsistent values for field {}: {} and {}'.format(
+                  f, n_child, o_child))
+        node_stack.extend(n_child)
+        other_stack.extend(o_child)
+
+      elif isinstance(n_child, (gast.AST, ast.AST)):
+        node_stack.append(n_child)
+        other_stack.append(o_child)
+
+      elif n_child != o_child:
+        raise ValueError(
+            'inconsistent values for field {}: {} and {}'.format(
+                f, n_child, o_child))
diff --git a/tensorflow/contrib/autograph/pyct/ast_util_test.py b/tensorflow/contrib/autograph/pyct/ast_util_test.py
index 981e398b930e232a563b439659a35376c2995f6c..2293c89720a54f7495670c6f28b00f716cad70db 100644
--- a/tensorflow/contrib/autograph/pyct/ast_util_test.py
+++ b/tensorflow/contrib/autograph/pyct/ast_util_test.py
@@ -44,7 +44,7 @@ class AstUtilTest(test.TestCase):
         node, {qual_names.QN('a'): qual_names.QN('renamed_a')})
 
     self.assertIsInstance(node.body[0].value.left.id, str)
-    source, _ = compiler.ast_to_source(node)
+    source = compiler.ast_to_source(node)
     self.assertEqual(source.strip(), 'renamed_a + b')
 
   def test_rename_symbols_attributes(self):
@@ -54,7 +54,7 @@ class AstUtilTest(test.TestCase):
     node = ast_util.rename_symbols(
         node, {qual_names.from_str('b.c'): qual_names.QN('renamed_b_c')})
 
-    source, _ = compiler.ast_to_source(node)
+    source = compiler.ast_to_source(node)
     self.assertEqual(source.strip(), 'renamed_b_c = renamed_b_c.d')
 
   def test_rename_symbols_annotations(self):
@@ -97,10 +97,10 @@ class AstUtilTest(test.TestCase):
     d = ast_util.keywords_to_dict(keywords)
     # Make sure we generate a usable dict node by attaching it to a variable and
     # compiling everything.
-    output = parser.parse_str('b = 3')
-    output.body += (ast.Assign([ast.Name(id='d', ctx=ast.Store())], d),)
-    result, _ = compiler.ast_to_object(output)
-    self.assertDictEqual(result.d, {'a': 3, 'c': 1, 'd': 'e'})
+    node = parser.parse_str('def f(b): pass').body[0]
+    node.body.append(ast.Return(d))
+    result, _ = compiler.ast_to_object(node)
+    self.assertDictEqual(result.f(3), {'a': 3, 'c': 1, 'd': 'e'})
 
   def assertMatch(self, target_str, pattern_str):
     node = parser.parse_expression(target_str)
@@ -130,8 +130,8 @@ class AstUtilTest(test.TestCase):
                        'super(Bar, _).__init__(_)')
 
   def _mock_apply_fn(self, target, source):
-    target, _ = compiler.ast_to_source(target)
-    source, _ = compiler.ast_to_source(source)
+    target = compiler.ast_to_source(target)
+    source = compiler.ast_to_source(source)
     self._invocation_counts[(target.strip(), source.strip())] += 1
 
   def test_apply_to_single_assignments_dynamic_unpack(self):
@@ -157,24 +157,40 @@ class AstUtilTest(test.TestCase):
     })
 
   def test_parallel_walk(self):
-    ret = ast.Return(
-        ast.BinOp(
-            op=ast.Add(),
-            left=ast.Name(id='a', ctx=ast.Load()),
-            right=ast.Num(1)))
-    node = ast.FunctionDef(
-        name='f',
-        args=ast.arguments(
-            args=[ast.Name(id='a', ctx=ast.Param())],
-            vararg=None,
-            kwarg=None,
-            defaults=[]),
-        body=[ret],
-        decorator_list=[],
-        returns=None)
+    node = parser.parse_str(
+        textwrap.dedent("""
+      def f(a):
+        return a + 1
+    """))
     for child_a, child_b in ast_util.parallel_walk(node, node):
       self.assertEqual(child_a, child_b)
 
+  def test_parallel_walk_inconsistent_trees(self):
+    node_1 = parser.parse_str(
+        textwrap.dedent("""
+      def f(a):
+        return a + 1
+    """))
+    node_2 = parser.parse_str(
+        textwrap.dedent("""
+      def f(a):
+        return a + (a * 2)
+    """))
+    node_3 = parser.parse_str(
+        textwrap.dedent("""
+      def f(a):
+        return a + 2
+    """))
+    with self.assertRaises(ValueError):
+      for _ in ast_util.parallel_walk(node_1, node_2):
+        pass
+    # There is not particular reason to reject trees that differ only in the
+    # value of a constant.
+    # TODO(mdan): This should probably be allowed.
+    with self.assertRaises(ValueError):
+      for _ in ast_util.parallel_walk(node_1, node_3):
+        pass
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/autograph/pyct/cfg.py b/tensorflow/contrib/autograph/pyct/cfg.py
index 25fec7fd532542c53eea4096ea2a13aabfa290db..ba51dcf285036220e01b89e8beeb9aec8ffe36be 100644
--- a/tensorflow/contrib/autograph/pyct/cfg.py
+++ b/tensorflow/contrib/autograph/pyct/cfg.py
@@ -67,10 +67,8 @@ class Node(object):
     if isinstance(self.ast_node, gast.FunctionDef):
       return 'def %s' % self.ast_node.name
     elif isinstance(self.ast_node, gast.withitem):
-      source, _ = compiler.ast_to_source(self.ast_node.context_expr)
-      return source.strip()
-    source, _ = compiler.ast_to_source(self.ast_node)
-    return source.strip()
+      return compiler.ast_to_source(self.ast_node.context_expr).strip()
+    return compiler.ast_to_source(self.ast_node).strip()
 
 
 class Graph(
diff --git a/tensorflow/contrib/autograph/pyct/common_transformers/anf_test.py b/tensorflow/contrib/autograph/pyct/common_transformers/anf_test.py
index 81983a5ecb7b8c6216285409f854e27b7154a08b..aefbc69d8cfb0d9b75e3da10fddee71b2c5e309a 100644
--- a/tensorflow/contrib/autograph/pyct/common_transformers/anf_test.py
+++ b/tensorflow/contrib/autograph/pyct/common_transformers/anf_test.py
@@ -43,7 +43,7 @@ class AnfTransformerTest(test.TestCase):
       return a
 
     node, _ = parser.parse_entity(test_function)
-    node = anf.transform(node, self._simple_source_info())
+    node = anf.transform(node.body[0], self._simple_source_info())
     result, _ = compiler.ast_to_object(node)
 
     self.assertEqual(test_function(), result.test_function())
diff --git a/tensorflow/contrib/autograph/pyct/compiler.py b/tensorflow/contrib/autograph/pyct/compiler.py
index c172ab21f6966d260b700d1e140f2775bd8bac2a..f9cee109624dafd4da4a0981c5f8fda0a5d8a5e7 100644
--- a/tensorflow/contrib/autograph/pyct/compiler.py
+++ b/tensorflow/contrib/autograph/pyct/compiler.py
@@ -30,66 +30,42 @@ import tempfile
 import astor
 import gast
 
-from tensorflow.contrib.autograph.pyct import anno
-from tensorflow.contrib.autograph.pyct import ast_util
 from tensorflow.contrib.autograph.pyct import origin_info
-from tensorflow.contrib.autograph.pyct import parser
 
 
-def _build_source_map(node, code):
-  """Return the Python objects represented by given AST.
-
-  Compiling the AST code this way ensures that the source code is readable by
-  e.g. `pdb` or `inspect`.
+def ast_to_source(node, indentation='  '):
+  """Return the source code of given AST.
 
   Args:
-    node: An AST node of the original generated code, before the source code is
-      generated.
-    code: The string representation of the source code for the newly generated
-      code.
+    node: The code to compile, as an AST object.
+    indentation: The string to use for indentation.
 
   Returns:
-    Dict[CodeLocation, OriginInfo], a mapping between the user and AutoGraph
-    generated code.
+    code: The source code generated from the AST object
+    source_mapping: A mapping between the user and AutoGraph generated code.
   """
-  # After we have the final generated code we reparse it to get the final line
-  # numbers. Then we walk through the generated and original ASTs in parallel
-  # to build the mapping between the user and generated code.
-  new_node = parser.parse_str(code)
-  origin_info.resolve(new_node, code)
-  source_mapping = {}
-  for before, after in ast_util.parallel_walk(node, new_node):
-    # Need both checks because if origin information is ever copied over to new
-    # nodes then we need to rely on the fact that only the original user code
-    # has the origin annotation.
-    if (anno.hasanno(before, anno.Basic.ORIGIN) and
-        anno.hasanno(after, anno.Basic.ORIGIN)):
-      source_info = anno.getanno(before, anno.Basic.ORIGIN)
-      new_line_number = anno.getanno(after, anno.Basic.ORIGIN).line_number
-      source_mapping[new_line_number] = source_info
-  return source_mapping
-
-
-def ast_to_source(node, indentation='  '):
-  """Return the source code of given AST."""
-  original_node = node
-  if isinstance(node, gast.AST):
-    node = gast.gast_to_ast(node)
+  if not isinstance(node, (list, tuple)):
+    node = (node,)
   generator = astor.codegen.SourceGenerator(indentation, False,
                                             astor.string_repr.pretty_string)
-  generator.visit(node)
-  generator.result.append('\n')
+
+  for n in node:
+    if isinstance(n, gast.AST):
+      n = gast.gast_to_ast(n)
+    generator.visit(n)
+    generator.result.append('\n')
+
   # In some versions of Python, literals may appear as actual values. This
   # ensures everything is string.
   code = map(str, generator.result)
   code = astor.source_repr.pretty_source(code).lstrip()
-  source_mapping = _build_source_map(original_node, code)
 
-  return code, source_mapping
+  return code
 
 
-def ast_to_object(node,
+def ast_to_object(nodes,
                   indentation='  ',
+                  include_source_map=False,
                   source_prefix=None,
                   delete_on_exit=True):
   """Return the Python objects represented by given AST.
@@ -98,41 +74,46 @@ def ast_to_object(node,
   e.g. `pdb` or `inspect`.
 
   Args:
-    node: The code to compile, as an AST object.
-    indentation: The string to use for indentation.
-    source_prefix: Optional string to print as-is into the source file.
-    delete_on_exit: Whether to delete the temporary file used for compilation on
-      exit.
+    nodes: Union[ast.AST, Iterable[ast.AST]], the code to compile, as an AST
+        object.
+    indentation: Text, the string to use for indentation.
+    include_source_map: bool, whether to attach a source map to the compiled
+        object. Also see origin_info.py.
+    source_prefix: Optional[Text], string to print as-is into the source file.
+    delete_on_exit: bool, whether to delete the temporary file used for
+        compilation on exit.
 
   Returns:
-    A module object containing the compiled source code.
+    compiled_nodes: A module object containing the compiled source code.
+    source: The source code of the compiled object
   Raises:
     ValueError: If ag_source_map__ is already in the namespace of the compiled
-    node.
+    nodes.
   """
-  # code_source_mapping does not yet include the offsets from import statements.
-  source, code_source_mapping = ast_to_source(node, indentation=indentation)
+  if not isinstance(nodes, (list, tuple)):
+    nodes = (nodes,)
+
+  source = ast_to_source(nodes, indentation=indentation)
+
+  if source_prefix:
+    source = source_prefix + '\n' + source
 
   with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
-    # TODO(znado): move into an _offset_source_map() helper function.
-    # Need to offset the generated line numbers by the number of import lines.
-    if source_prefix:
-      num_import_lines = source_prefix.count('\n') + 1
-    else:
-      num_import_lines = 0
-    source_mapping = {}
-    for line_number, original_position in code_source_mapping.items():
-      source_map_key = origin_info.CodeLocation(
-          file_path=f.name, line_number=line_number + num_import_lines)
-      source_mapping[source_map_key] = original_position
     module_name = os.path.basename(f.name[:-3])
-    if source_prefix:
-      f.write(source_prefix)
-      f.write('\n')
     f.write(source)
+
+    if isinstance(nodes, (list, tuple)):
+      indices = range(-len(nodes), 0)
+    else:
+      indices = (-1,)
+
+    if include_source_map:
+      source_map = origin_info.source_map(nodes, source, f.name, indices)
+
+  # TODO(mdan): Try flush() and delete=False instead.
   if delete_on_exit:
     atexit.register(lambda: os.remove(f.name))
-  compiled_node = imp.load_source(module_name, f.name)
+  compiled_nodes = imp.load_source(module_name, f.name)
 
   # TODO(znado): Clean this up so we don't need to attach it to the namespace.
   # TODO(znado): This does not work for classes because their methods share a
@@ -148,11 +129,13 @@ def ast_to_object(node,
   # is hard, and this cleanly fixes the
   # issues encountered with nested functions because this is attached to the
   # outermost one.
-  source_map_name = 'ag_source_map__'
-  if source_map_name in compiled_node.__dict__:
-    raise ValueError('cannot convert %s because is has namespace attribute '
-                     '"%s", which is reserved for AutoGraph.' %
-                     (compiled_node, source_map_name))
-  compiled_node.__dict__[source_map_name] = source_mapping
-
-  return compiled_node, source
+  if include_source_map:
+    # TODO(mdan): This name should be decided by the caller.
+    source_map_name = 'ag_source_map__'
+    if source_map_name in compiled_nodes.__dict__:
+      raise ValueError('cannot convert %s because is has namespace attribute '
+                       '"%s", which is reserved for AutoGraph.' %
+                       (compiled_nodes, source_map_name))
+    compiled_nodes.__dict__[source_map_name] = source_map
+
+  return compiled_nodes, source
diff --git a/tensorflow/contrib/autograph/pyct/compiler_test.py b/tensorflow/contrib/autograph/pyct/compiler_test.py
index e29fa9324c6b742e8f60c1b6b3302c6fa1374a96..cf783da6a3e540c6901a5fe9a5e4afdb6b1cfc03 100644
--- a/tensorflow/contrib/autograph/pyct/compiler_test.py
+++ b/tensorflow/contrib/autograph/pyct/compiler_test.py
@@ -59,7 +59,7 @@ class CompilerTest(test.TestCase):
                 value=gast.Str('c'))
         ])
 
-    source, _ = compiler.ast_to_source(node, indentation='  ')
+    source = compiler.ast_to_source(node, indentation='  ')
     self.assertEqual(
         textwrap.dedent("""
             if 1:
diff --git a/tensorflow/contrib/autograph/pyct/origin_info.py b/tensorflow/contrib/autograph/pyct/origin_info.py
index 614e346634ddc180ee2364407744537d725eb325..1aad2f47dfa66cc1bfd3a7a66d31b03e2aa0d09e 100644
--- a/tensorflow/contrib/autograph/pyct/origin_info.py
+++ b/tensorflow/contrib/autograph/pyct/origin_info.py
@@ -22,49 +22,115 @@ import collections
 import gast
 
 from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import ast_util
+from tensorflow.contrib.autograph.pyct import parser
 from tensorflow.python.util import tf_inspect
 
 
-class CodeLocation(
-    collections.namedtuple('CodeLocation', ('file_path', 'line_number'))):
-  """Location of a line of code.
+class LineLocation(
+    collections.namedtuple('LineLocation', ('filename', 'lineno'))):
+  """Similar to Location, but without column information.
 
   Attributes:
-    file_path: text, the full path to the file containing the code.
-    line_number: Int, the 1-based line number of the code in its file.
+    filename: Text
+    lineno: int, 1-based
   """
   pass
 
 
+class Location(
+    collections.namedtuple('Location', ('filename', 'lineno', 'col_offset'))):
+  """Encodes code location information.
+
+  Attributes:
+    filename: Text
+    lineno: int, 1-based
+    col_offset: int
+  """
+
+  @property
+  def line_loc(self):
+    return LineLocation(self.filename, self.lineno)
+
+
 class OriginInfo(
-    collections.namedtuple('OriginInfo',
-                           ('file_path', 'function_name', 'line_number',
-                            'column_offset', 'source_code_line'))):
+    collections.namedtuple(
+        'OriginInfo',
+        ('loc', 'function_name', 'source_code_line'))):
   """Container for information about the source code before conversion.
 
-  Instances of this class contain information about the source code that
-  transformed code originated from. Examples include:
-    * line number
-    * file name
-    * original user code
+  Attributes:
+    loc: Location
+    function_name: Optional[Text]
+    source_code_line: Text
   """
 
   def as_frame(self):
-    """Makes a traceback frame tuple.
-
-    Returns:
-      A tuple of (file_path, line_number, function_name, source_code_line).
-    """
-    return (self.file_path, self.line_number, self.function_name,
+    """Returns a 4-tuple consistent with the return of traceback.extract_tb."""
+    return (self.loc.filename, self.loc.lineno, self.function_name,
             self.source_code_line)
 
 
+# TODO(mdan): This source map should be a class - easier to refer to.
+def source_map(nodes, code, filename, indices_in_code):
+  """Creates a source map between an annotated AST and the code it compiles to.
+
+  Args:
+    nodes: Iterable[ast.AST, ...]
+    code: Text
+    filename: Optional[Text]
+    indices_in_code: Union[int, Iterable[int, ...]], the positions at which
+        nodes appear in code. The parser always returns a module when parsing
+        code. This argument indicates the position in that module's body at
+        which the corresponding of node should appear.
+
+  Returns:
+    Dict[CodeLocation, OriginInfo], mapping locations in code to locations
+    indicated by origin annotations in node.
+  """
+  reparsed_nodes = parser.parse_str(code)
+  reparsed_nodes = [reparsed_nodes.body[i] for i in indices_in_code]
+
+  resolve(reparsed_nodes, code)
+  result = {}
+
+  for before, after in ast_util.parallel_walk(nodes, reparsed_nodes):
+    # Note: generated code might not be mapped back to its origin.
+    # TODO(mdan): Generated code should always be mapped to something.
+    origin_info = anno.getanno(before, anno.Basic.ORIGIN, default=None)
+    final_info = anno.getanno(after, anno.Basic.ORIGIN, default=None)
+    if origin_info is None or final_info is None:
+      continue
+
+    line_loc = LineLocation(filename, final_info.loc.lineno)
+
+    existing_origin = result.get(line_loc)
+    if existing_origin is not None:
+      # Overlaps may exist because of child nodes, but almost never to
+      # different line locations. Exception make decorated functions, where
+      # both lines are mapped to the same line in the AST.
+
+      # Line overlaps: keep bottom node.
+      if existing_origin.loc.line_loc == origin_info.loc.line_loc:
+        if existing_origin.loc.lineno >= origin_info.loc.lineno:
+          continue
+
+      # In case of overlaps, keep the leftmost node.
+      if existing_origin.loc.col_offset <= origin_info.loc.col_offset:
+        continue
+
+    result[line_loc] = origin_info
+
+  return result
+
+
 # TODO(znado): Consider refactoring this into a Visitor.
-def resolve(node, source, function=None):
+# TODO(mdan): Does this work correctly with inner functions?
+def resolve(nodes, source, function=None):
   """Adds an origin information to all nodes inside the body of function.
 
   Args:
-    node: The AST node for the function whose body nodes will be annotated.
+    nodes: Union[ast.AST, Iterable[ast.AST, ...]]
     source: Text, the source code string for the function whose body nodes will
       be annotated.
     function: Callable, the function that will have all nodes inside of it
@@ -76,25 +142,32 @@ def resolve(node, source, function=None):
     A tuple of the AST node for function and a String containing its source
     code.
   """
+  if not isinstance(nodes, (list, tuple)):
+    nodes = (nodes,)
+
   if function:
     _, function_lineno = tf_inspect.getsourcelines(function)
     function_filepath = tf_inspect.getsourcefile(function)
   else:
     function_lineno = None
     function_filepath = None
+
   source_lines = source.split('\n')
-  for n in gast.walk(node):
-    if hasattr(n, 'lineno'):
-      # n.lineno is relative to the start of the enclosing function, so need to
-      # offset it by the line of the function.
-      source_code_line = source_lines[n.lineno - 1]
+  for node in nodes:
+    for n in gast.walk(node):
+      if not hasattr(n, 'lineno'):
+        continue
+
+      lineno_in_body = n.lineno
+
+      source_code_line = source_lines[lineno_in_body - 1]
       if function:
-        source_lineno = n.lineno + function_lineno - 1
+        source_lineno = function_lineno + lineno_in_body
         function_name = function.__name__
       else:
-        source_lineno = n.lineno
+        source_lineno = lineno_in_body
         function_name = None
-      anno.setanno(
-          n, anno.Basic.ORIGIN,
-          OriginInfo(function_filepath, function_name, source_lineno,
-                     n.col_offset, source_code_line))
+
+      location = Location(function_filepath, source_lineno, n.col_offset)
+      origin = OriginInfo(location, function_name, source_code_line)
+      anno.setanno(n, anno.Basic.ORIGIN, origin)
diff --git a/tensorflow/contrib/autograph/pyct/origin_info_test.py b/tensorflow/contrib/autograph/pyct/origin_info_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d7d8b1622a2ddb1a1d0eaeec50bdfaf38f05182
--- /dev/null
+++ b/tensorflow/contrib/autograph/pyct/origin_info_test.py
@@ -0,0 +1,101 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for origin_info module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import compiler
+from tensorflow.contrib.autograph.pyct import origin_info
+from tensorflow.contrib.autograph.pyct import parser
+from tensorflow.python.platform import test
+
+
+class OriginInfoTest(test.TestCase):
+
+  def test_source_map(self):
+
+    def test_fn(x):
+      if x > 0:
+        x += 1
+      return x
+
+    node, source = parser.parse_entity(test_fn)
+    fn_node = node.body[0]
+    origin_info.resolve(fn_node, source)
+
+    # Insert a traced line.
+    new_node = parser.parse_str('x = abs(x)').body[0]
+    anno.copyanno(fn_node.body[0], new_node, anno.Basic.ORIGIN)
+    fn_node.body.insert(0, new_node)
+
+    # Insert an untraced line.
+    fn_node.body.insert(0, parser.parse_str('x = 0').body[0])
+
+    modified_source = compiler.ast_to_source(fn_node)
+
+    source_map = origin_info.source_map(fn_node, modified_source,
+                                        'test_filename', [0])
+
+    loc = origin_info.LineLocation('test_filename', 1)
+    origin = source_map[loc]
+    self.assertEqual(origin.source_code_line, 'def test_fn(x):')
+    self.assertEqual(origin.loc.lineno, 1)
+
+    # The untraced line, inserted second.
+    loc = origin_info.LineLocation('test_filename', 2)
+    self.assertFalse(loc in source_map)
+
+    # The traced line, inserted first.
+    loc = origin_info.LineLocation('test_filename', 3)
+    origin = source_map[loc]
+    self.assertEqual(origin.source_code_line, '  if x > 0:')
+    self.assertEqual(origin.loc.lineno, 2)
+
+    loc = origin_info.LineLocation('test_filename', 4)
+    origin = source_map[loc]
+    self.assertEqual(origin.source_code_line, '  if x > 0:')
+    self.assertEqual(origin.loc.lineno, 2)
+
+  def test_resolve(self):
+
+    def test_fn(x):
+      """Docstring."""
+      return x  # comment
+
+    node, source = parser.parse_entity(test_fn)
+    fn_node = node.body[0]
+    origin_info.resolve(fn_node, source)
+
+    origin = anno.getanno(fn_node, anno.Basic.ORIGIN)
+    self.assertEqual(origin.loc.lineno, 1)
+    self.assertEqual(origin.loc.col_offset, 0)
+    self.assertEqual(origin.source_code_line, 'def test_fn(x):')
+
+    origin = anno.getanno(fn_node.body[0], anno.Basic.ORIGIN)
+    self.assertEqual(origin.loc.lineno, 2)
+    self.assertEqual(origin.loc.col_offset, 2)
+    self.assertEqual(origin.source_code_line, '  """Docstring."""')
+
+    origin = anno.getanno(fn_node.body[1], anno.Basic.ORIGIN)
+    self.assertEqual(origin.loc.lineno, 3)
+    self.assertEqual(origin.loc.col_offset, 2)
+    self.assertEqual(origin.source_code_line, '  return x  # comment')
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/autograph/pyct/parser.py b/tensorflow/contrib/autograph/pyct/parser.py
index c961efa892df6a21804dae8f52ef64bf99cd409e..112ed46a1e487a7904e79267c1ce7db0ad914552 100644
--- a/tensorflow/contrib/autograph/pyct/parser.py
+++ b/tensorflow/contrib/autograph/pyct/parser.py
@@ -37,6 +37,7 @@ def parse_entity(entity):
 
 def parse_str(src):
   """Returns the AST of given piece of code."""
+  # TODO(mdan): This should exclude the module things are autowrapped in.
   return gast.parse(src)
 
 
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/live_values.py b/tensorflow/contrib/autograph/pyct/static_analysis/live_values.py
index 32802069ba40aa511e0da2422e235e26a73f9df6..2d8f922a4589e45ab7e4f20f800e0ffef3d7f0a5 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/live_values.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/live_values.py
@@ -91,12 +91,20 @@ class LiveValueResolver(transformer.Base):
     if anno.hasanno(node.value, 'live_val'):
       assert anno.hasanno(node.value, 'fqn')
       parent_object = anno.getanno(node.value, 'live_val')
-      if not hasattr(parent_object, node.attr):
-        raise AttributeError('%s has no attribute %s' % (parent_object,
-                                                         node.attr))
+
       anno.setanno(node, 'parent_type', type(parent_object))
-      anno.setanno(node, 'live_val', getattr(parent_object, node.attr))
       anno.setanno(node, 'fqn', anno.getanno(node.value, 'fqn') + (node.attr,))
+      if hasattr(parent_object, node.attr):
+        # This can happen when the attribute's creation and use depend on the
+        # same static condition, for example:
+        #
+        #  if cond:
+        #    foo.bar = baz
+        #  if cond:
+        #    x = foo.bar
+        #
+        anno.setanno(node, 'live_val', getattr(parent_object, node.attr))
+
     # TODO(mdan): Investigate the role built-in annotations can play here.
     elif anno.hasanno(node.value, 'type'):
       parent_type = anno.getanno(node.value, 'type')
diff --git a/tensorflow/contrib/autograph/pyct/templates.py b/tensorflow/contrib/autograph/pyct/templates.py
index 72d1d3b269f1293fa1e90a40d46e9abf8720bf54..5831d57ceb58d4b291a4f52bbf4282e107104219 100644
--- a/tensorflow/contrib/autograph/pyct/templates.py
+++ b/tensorflow/contrib/autograph/pyct/templates.py
@@ -140,8 +140,8 @@ class ReplaceTransformer(gast.NodeTransformer):
 
   def _set_inner_child_context(self, node, ctx):
     if isinstance(node, gast.Attribute):
-      self._set_inner_child_context(node.value, ctx)
-      node.ctx = gast.Load()
+      self._set_inner_child_context(node.value, gast.Load())
+      node.ctx = ctx
     elif isinstance(node, gast.Tuple):
       for e in node.elts:
         self._set_inner_child_context(e, ctx)
diff --git a/tensorflow/contrib/autograph/pyct/templates_test.py b/tensorflow/contrib/autograph/pyct/templates_test.py
index a8bbc5a4de3bacce10dd0b8b0648bd0cb495e8c9..77e8ff62fd8665e095cfb410a2aa418e9f9bd52b 100644
--- a/tensorflow/contrib/autograph/pyct/templates_test.py
+++ b/tensorflow/contrib/autograph/pyct/templates_test.py
@@ -97,6 +97,19 @@ class TemplatesTest(test.TestCase):
     with self.assertRaises(ValueError):
       templates.replace(template, foo=1)
 
+  def test_replace_attribute_context(self):
+    template = """
+      def test_fn(foo):
+        foo = 0
+    """
+
+    node = templates.replace(
+        template,
+        foo=parser.parse_expression('a.b.c'))[0]
+    self.assertIsInstance(node.body[0].targets[0].ctx, gast.Store)
+    self.assertIsInstance(node.body[0].targets[0].value.ctx, gast.Load)
+    self.assertIsInstance(node.body[0].targets[0].value.value.ctx, gast.Load)
+
   def test_replace_call_keyword(self):
     template = """
       def test_fn():
diff --git a/tensorflow/contrib/autograph/pyct/transformer.py b/tensorflow/contrib/autograph/pyct/transformer.py
index bbdfefc50ab61bc39ba77b91c1db2745712d7790..969ca12244148b346ba3160fba124384a9641a05 100644
--- a/tensorflow/contrib/autograph/pyct/transformer.py
+++ b/tensorflow/contrib/autograph/pyct/transformer.py
@@ -398,18 +398,36 @@ class Base(gast.NodeTransformer):
     try:
       source, _ = compiler.ast_to_source(node)
       return source
-    except AssertionError:
+    # pylint: disable=broad-except
+    # This function is used for error reporting.  If an exception occurs here,
+    # it should be suppressed, in favor of emitting as informative a message
+    # about the original error as possible.
+    except Exception:
       return '<could not convert AST to source>'
 
   def visit(self, node):
+    if not isinstance(node, gast.AST):
+      # This is not that uncommon a mistake: various node bodies are lists, for
+      # example, posing a land mine for transformers that need to recursively
+      # call `visit`.  The error needs to be raised before the exception handler
+      # below is installed, because said handler will mess up if `node` is not,
+      # in fact, a node.
+      msg = (
+          'invalid value for "node": expected "ast.AST", got "{}"; to'
+          ' visit lists of nodes, use "visit_block" instead').format(type(node))
+      raise ValueError(msg)
+
     source_code = self.entity_info.source_code
     source_file = self.entity_info.source_file
     did_enter_function = False
     local_scope_size_at_entry = len(self._local_scope_state)
+    processing_expr_node = False
 
     try:
       if isinstance(node, (gast.FunctionDef, gast.ClassDef, gast.Lambda)):
         did_enter_function = True
+      elif isinstance(node, gast.Expr):
+        processing_expr_node = True
 
       if did_enter_function:
         self._enclosing_entities.append(node)
@@ -418,9 +436,23 @@ class Base(gast.NodeTransformer):
         self._lineno = node.lineno
         self._col_offset = node.col_offset
 
+      if processing_expr_node:
+        entry_expr_value = node.value
+
       if not anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
         result = super(Base, self).visit(node)
 
+      # Adjust for consistency: replacing the value of an Expr with
+      # an Assign node removes the need for the Expr node.
+      if processing_expr_node:
+        if isinstance(result, gast.Expr) and result.value != entry_expr_value:
+          # When the replacement is a list, it is assumed that the list came
+          # from a template that contained a number of statements, which
+          # themselves are standalone and don't require an enclosing Expr.
+          if isinstance(result.value,
+                        (list, tuple, gast.Assign, gast.AugAssign)):
+            result = result.value
+
       # On exception, the local scope integrity is not guaranteed.
       if did_enter_function:
         self._enclosing_entities.pop()
diff --git a/tensorflow/contrib/autograph/pyct/transformer_test.py b/tensorflow/contrib/autograph/pyct/transformer_test.py
index 19b80b09ac748999cac9e21a60b2da6c2a23faa5..a37e922a1de902106dd3a11f20a14ddde8f6675e 100644
--- a/tensorflow/contrib/autograph/pyct/transformer_test.py
+++ b/tensorflow/contrib/autograph/pyct/transformer_test.py
@@ -282,6 +282,88 @@ class TransformerTest(test.TestCase):
     self.assertTrue(isinstance(node.body[1].body[0], gast.Assign))
     self.assertTrue(isinstance(node.body[1].body[1], gast.Return))
 
+  def test_robust_error_on_list_visit(self):
+
+    class BrokenTransformer(transformer.Base):
+
+      def visit_If(self, node):
+        # This is broken because visit expects a single node, not a list, and
+        # the body of an if is a list.
+        # Importantly, the default error handling in visit also expects a single
+        # node.  Therefore, mistakes like this need to trigger a type error
+        # before the visit called here installs its error handler.
+        # That type error can then be caught by the enclosing call to visit,
+        # and correctly blame the If node.
+        self.visit(node.body)
+        return node
+
+    def test_function(x):
+      if x > 0:
+        return x
+
+    tr = BrokenTransformer(self._simple_source_info())
+
+    node, _ = parser.parse_entity(test_function)
+    with self.assertRaises(transformer.AutographParseError) as cm:
+      node = tr.visit(node)
+    obtained_message = str(cm.exception)
+    expected_message = r'expected "ast.AST", got "\<(type|class) \'list\'\>"'
+    self.assertRegexpMatches(obtained_message, expected_message)
+    # The exception should point at the if statement, not any place else.  Could
+    # also check the stack trace.
+    self.assertTrue(
+        'Occurred at node:\nIf' in obtained_message, obtained_message)
+    self.assertTrue(
+        'Occurred at node:\nFunctionDef' not in obtained_message,
+        obtained_message)
+    self.assertTrue(
+        'Occurred at node:\nReturn' not in obtained_message, obtained_message)
+
+  def test_robust_error_on_ast_corruption(self):
+    # A child class should not be able to be so broken that it causes the error
+    # handling in `transformer.Base` to raise an exception.  Why not?  Because
+    # then the original error location is dropped, and an error handler higher
+    # up in the call stack gives misleading information.
+
+    # Here we test that the error handling in `visit` completes, and blames the
+    # correct original exception, even if the AST gets corrupted.
+
+    class NotANode(object):
+      pass
+
+    class BrokenTransformer(transformer.Base):
+
+      def visit_If(self, node):
+        node.body = NotANode()
+        raise ValueError('I blew up')
+
+    def test_function(x):
+      if x > 0:
+        return x
+
+    tr = BrokenTransformer(self._simple_source_info())
+
+    node, _ = parser.parse_entity(test_function)
+    with self.assertRaises(transformer.AutographParseError) as cm:
+      node = tr.visit(node)
+    obtained_message = str(cm.exception)
+    # The message should reference the exception actually raised, not anything
+    # from the exception handler.
+    expected_substring = 'I blew up'
+    self.assertTrue(expected_substring in obtained_message, obtained_message)
+    # Expect the exception to have failed to parse the corrupted AST
+    self.assertTrue(
+        '<could not convert AST to source>' in obtained_message,
+        obtained_message)
+    # The exception should point at the if statement, not any place else.  Could
+    # also check the stack trace.
+    self.assertTrue(
+        'Occurred at node:\nIf' in obtained_message, obtained_message)
+    self.assertTrue(
+        'Occurred at node:\nFunctionDef' not in obtained_message,
+        obtained_message)
+    self.assertTrue(
+        'Occurred at node:\nReturn' not in obtained_message, obtained_message)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/autograph/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py
index 71079cfdc04feaf26ab07b7dba193f745555433f..ccbe5fc9541dfad561d8eab730e2b15f6250ceb2 100644
--- a/tensorflow/contrib/autograph/utils/builtins.py
+++ b/tensorflow/contrib/autograph/utils/builtins.py
@@ -27,6 +27,7 @@ from tensorflow.contrib.autograph.utils import type_check
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
 
@@ -50,7 +51,9 @@ def dynamic_builtin(f, *args, **kwargs):
 
 def dynamic_len(list_or_tensor):
   """Implementation of len using dynamic dispatch."""
-  if tensor_util.is_tensor(list_or_tensor):
+  if _is_tensor_list(list_or_tensor):
+    return list_ops.tensor_list_length(list_or_tensor)
+  elif tensor_util.is_tensor(list_or_tensor):
     shape = list_or_tensor.shape
     if not shape.ndims:
       raise ValueError(
@@ -59,6 +62,11 @@ def dynamic_len(list_or_tensor):
   return len(list_or_tensor)
 
 
+def _is_tensor_list(list_or_tensor):
+  return (tensor_util.is_tensor(list_or_tensor)
+          and list_or_tensor.dtype == dtypes.variant)
+
+
 def dynamic_int(num_or_tensor, **kwargs):
   """Implementation of int() using dynamic dispatch."""
   if tensor_util.is_tensor(num_or_tensor):
diff --git a/tensorflow/contrib/bigtable/README.md b/tensorflow/contrib/bigtable/README.md
index ef3c60069e8a97f7a13457156d20f3f7a4f7eccb..d7c71a20ed4ba6a55dc0356ab5a3d096ed042e59 100644
--- a/tensorflow/contrib/bigtable/README.md
+++ b/tensorflow/contrib/bigtable/README.md
@@ -1,10 +1,344 @@
 # Bigtable #
 
-[Google Cloud Bigtable](https://cloud.google.com/bigtable/) is a high
+[Cloud Bigtable](https://cloud.google.com/bigtable/) is a high
 performance storage system that can store and serve training data. This contrib
 package contains an experimental integration with TensorFlow.
 
 > **Status: Highly experimental.** The current implementation is very much in
 > flux. Please use at your own risk! :-)
 
-<!-- TODO(saeta): Document usage / methods / etc. -->
+The TensorFlow integration with Cloud Bigtable is optimized for common
+TensorFlow usage and workloads. It is currently optimized for reading from Cloud
+Bigtable at high speed, in particular to feed modern accelerators. For
+general-purpose Cloud Bigtable
+APIs, see the [official Cloud Bigtable client library documentation][clientdoc].
+
+[clientdoc]:  https://cloud.google.com/bigtable/docs/reference/libraries
+
+## Sample Use
+
+There are three main reading styles supported by the `BigtableTable` class:
+
+ 1. **Reading keys**: Read only the row keys in a table. Keys are returned in
+    sorted order from the table. Most key reading operations retrieve all keys
+    in a contiguous range, however the `sample_keys` operation skips keys, and
+    operates on the whole table (and not a contiguous subset).
+ 2. **Retrieving a row's values**: Given a row key, look up the data associated
+    with a defined set of columns. This operation takes advantage of Cloud
+    Bigtable's low-latency and excellent support for random access.
+ 3. **Scanning ranges**: Given a contiguous range of rows retrieve both the row
+    key and the data associated with a fixed set of columns. This operation
+    takes advantage of Cloud Bigtable's high throughput scans, and is the most
+    efficient way to read data.
+
+When using the Cloud Bigtable API, the workflow is:
+
+ 1. Create a `BigtableClient` object.
+ 2. Use the `BigtableClient` to create `BigtableTable` objects corresponding to
+    each table in the Cloud Bigtable instance you would like to access.
+ 3. Call methods on the `BigtableTable` object to create `tf.data.Dataset`s to
+    retrieve data.
+
+The following is an example for how to read all row keys with the prefix
+`train-`.
+
+```python
+import tensorflow as tf
+
+GCP_PROJECT_ID = '<FILL_ME_IN>'
+BIGTABLE_INSTANCE_ID = '<FILL_ME_IN>'
+BIGTABLE_TABLE_NAME = '<FILL_ME_IN>'
+PREFIX = 'train-'
+
+def main():
+  client = tf.contrib.cloud.BigtableClient(GCP_PROJECT_ID, BIGTABLE_INSTANCE_ID)
+  table = client.table(BIGTABLE_TABLE_NAME)
+  dataset = table.keys_by_prefix_dataset(PREFIX)
+  iterator = dataset.make_initializable_iterator()
+  get_next_op = iterator.get_next()
+
+  with tf.Session() as sess:
+    print('Initializing the iterator.')
+    sess.run(iterator.initializer)
+    print('Retrieving rows:')
+    row_index = 0
+    while True:
+      try:
+        row_key = sess.run(get_next_op)
+        print('Row key %d: %s' % (row_index, row_key))
+        row_index += 1
+      except tf.errors.OutOfRangeError:
+        print('Finished reading data!')
+        break
+
+if __name__ == '__main__':
+  main()
+
+```
+
+### Reading row keys
+
+Read only the row keys in a table. Keys are returned in sorted order from the
+table. Most key reading operations retrieve all keys in a contiguous range,
+however the `sample_keys` operation skips keys, and operates on the whole table
+(and not a contiguous subset).
+
+There are 3 methods to retrieve row keys:
+
+ - `table.keys_by_range_dataset(start, end)`: Retrieve row keys starting with
+   `start`, and ending with `end`. The range is "half-open", and thus it
+   includes `start` if `start` is present in the table. It does not include
+   `end`.
+ - `table.keys_by_prefix_dataset(prefix)`: Retrieves all row keys that start
+   with `prefix`. It includes the row key `prefix` if present in the table.
+ - `table.sample_keys()`: Retrieves a sampling of keys from the underlying
+   table. This is often useful in conjunction with parallel scans.
+
+### Reading cell values given a row key
+
+Given a dataset producing row keys, you can use the `table.lookup_columns`
+transformation to retrieve values. Example:
+
+```python
+key_dataset = tf.data.Dataset.from_tensor_slices([
+    'row_key_1',
+    'other_row_key',
+    'final_row_key',
+])
+values_dataset = key_dataset.apply(
+  table.lookup_columns(('my_column_family', 'column_name'),
+                       ('other_cf', 'col')))
+training_data = values_dataset.map(my_parsing_function)  # ...
+```
+
+### Scanning ranges
+Given a contiguous range of rows retrieve both the row key and the data
+associated with a fixed set of columns. Scanning is the most efficient way to
+retrieve data from Cloud Bigtable and is thus a very common API for high
+performance data pipelines. To construct a scanning `tf.data.Dataset` from a
+`BigtableTable` object, call one of the following methods:
+
+ - `table.scan_prefix(prefix, ...)`
+ - `table.scan_range(start, end, ...)`
+ - `table.parallel_scan_prefix(prefix, ...)`
+ - `table.parallel_scan_range(start, end, ...)`
+
+Aside from the specification of the contiguous range of rows, they all take the
+following arguments:
+
+ - `probability`: (Optional.) A float between 0 (exclusive) and 1 (inclusive).
+      A non-1 value indicates to probabilistically sample rows with the
+      provided probability.
+ - `columns`: The columns to read. (See below.)
+ - `**kwargs`: The columns to read. (See below.)
+
+In addition the two parallel operations accept the following optional argument:
+`num_parallel_scans` which configures the number of parallel Cloud Bigtable scan
+operations to run. A reasonable default is automatically chosen for small
+Cloud Bigtable clusters. If you have a large cluster, or an extremely demanding
+workload, you can tune this value to optimize performance.
+
+#### Specifying columns to read when scanning
+
+All of the scan operations allow you to specify the column family and columns
+in the same ways.
+
+##### Using `columns`
+
+The first way to specify the data to read is via the `columns` parameter. The
+value should be a tuple (or list of tuples) of strings. The first string in the
+tuple is the column family, and the second string in the tuple is the column
+qualifier.
+
+##### Using `**kwargs`
+
+The second way to specify the data to read is via the `**kwargs` parameter,
+which you can use to specify keyword arguments corresponding to the columns that
+you want to read. The keyword to use is the column family name, and the argument
+value should be either a string, or a tuple of strings, specifying the column
+qualifiers (column names).
+
+Although using `**kwargs` has the advantage of requiring less typing, it is not
+future-proof in all cases. (If we add a new parameter to the scan functions that
+has the same name as your column family, your code will break.)
+
+##### Examples
+
+Below are two equivalent snippets for how to specify which columns to read:
+
+```python
+ds1 = table.scan_range("row_start", "row_end", columns=[("cfa", "c1"),
+                                                        ("cfa", "c2"),
+                                                        ("cfb", "c3")])
+ds2 = table.scan_range("row_start", "row_end", cfa=["c1", "c2"], cfb="c3")
+```
+
+In this example, we are reading 3 columns from a total of 2 column families.
+From the `cfa` column family, we are reading columns `c1`, and `c2`. From the
+second column family (`cfb`), we are reading `c3`. Both `ds1` and `ds2` will
+output elements of the following types (`tf.string`, `tf.string`, `tf.string`,
+`tf.string`). The first `tf.string` is the row key, the second `tf.string` is
+the latest data in cell `cfa:c1`, the third corresponds to `cfa:c2`, and the
+final one is `cfb:c3`.
+
+#### Determinism when scanning
+
+While the non-parallel scan operations are fully deterministic, the parallel
+scan operations are not. If you would like to scan in parallel without losing
+determinism, you can build up the `parallel_interleave` yourself. As an example,
+say we wanted to scan all rows between `training_data_00000`, and
+`training_data_90000`, we can use the following code snippet:
+
+```python
+table = # ...
+columns = [('cf1', 'col1'), ('cf1', 'col2')]
+NUM_PARALLEL_READS = # ...
+ds = tf.data.Dataset.range(9).shuffle(10)
+def interleave_fn(index):
+  # Given a starting index, create 2 strings to be the start and end
+  start_idx = index
+  end_idx = index + 1
+  start_idx_str = tf.as_string(start_idx * 10000, width=5, fill='0')
+  end_idx_str = tf.as_string(end_idx * 10000, width=5, fill='0')
+  start = tf.string_join(['training_data_', start_idx_str])
+  end = tf.string_join(['training_data_', end_idx_str])
+  return table.scan_range(start_idx, end_idx, columns=columns)
+ds = ds.apply(tf.contrib.data.parallel_interleave(
+    interleave_fn, cycle_length=NUM_PARALLEL_READS, prefetch_input_elements=1))
+```
+
+> Note: you should divide up the key range into more sub-ranges for increased
+> parallelism.
+
+## Writing to Cloud Bigtable
+
+In order to simplify getting started, this package provides basic support for
+writing data into Cloud Bigtable.
+
+> Note: The implementation is not optimized for performance! Please consider
+> using alternative frameworks such as Apache Beam / Cloud Dataflow for
+> production workloads.
+
+Below is an example for how to write a trivial dataset into Cloud Bigtable.
+
+```python
+import tensorflow as tf
+
+GCP_PROJECT_ID = '<FILL_ME_IN>'
+BIGTABLE_INSTANCE_ID = '<FILL_ME_IN>'
+BIGTABLE_TABLE_NAME = '<FILL_ME_IN>'
+COLUMN_FAMILY = '<FILL_ME_IN>'
+COLUMN_QUALIFIER = '<FILL_ME_IN>'
+
+def make_dataset():
+  """Makes a dataset to write to Cloud Bigtable."""
+  return tf.data.Dataset.from_tensor_slices([
+      'training_data_1',
+      'training_data_2',
+      'training_data_3',
+  ])
+
+def make_row_key_dataset():
+  """Makes a dataset of strings used for row keys.
+
+  The strings are of the form: `fake-data-` followed by a sequential counter.
+  For example, this dataset would contain the following elements:
+
+   - fake-data-00000001
+   - fake-data-00000002
+   - ...
+   - fake-data-23498103
+  """
+  counter_dataset = tf.contrib.data.Counter()
+  width = 8
+  row_key_prefix = 'fake-data-'
+  ds = counter_dataset.map(lambda index: tf.as_string(index,
+                                                      width=width,
+                                                      fill='0'))
+  ds = ds.map(lambda idx_str: tf.string_join([row_key_prefix, idx_str]))
+  return ds
+
+
+def main():
+  client = tf.contrib.cloud.BigtableClient(GCP_PROJECT_ID, BIGTABLE_INSTANCE_ID)
+  table = client.table(BIGTABLE_TABLE_NAME)
+  dataset = make_dataset()
+  index_dataset = make_row_key_dataset()
+  aggregate_dataset = tf.data.Dataset.zip((index_dataset, dataset))
+  write_op = table.write(aggregate_dataset, column_families=[COLUMN_FAMILY],
+                         columns=[COLUMN_QUALIFIER])
+
+  with tf.Session() as sess:
+    print('Starting transfer.')
+    sess.run(write_op)
+    print('Transfer complete.')
+
+if __name__ == '__main__':
+  main()
+```
+
+## Sample applications and architectures
+
+While most machine learning applications are well suited by a high performance
+distributed file system, there are certain applications where using Cloud
+Bigtable works extremely well.
+
+### Perfect Shuffling
+
+Normally, training data is stored in flat files, and a combination of
+(1) `tf.data.Dataset.interleave` (or `parallel_interleave`), (2)
+`tf.data.Dataset.shuffle`, and (3) writing the data in an unsorted order in the
+data files in the first place, provides enough randomization to ensure models
+train efficiently. However, if you would like perfect shuffling, you can use
+Cloud Bigtable's low-latency random access capabilities. Create a
+`tf.data.Dataset` that generates the keys in a perfectly random order (or read
+all the keys into memory and use a shuffle buffer sized to fit all of them for a
+perfect random shuffle using `tf.data.Dataset.shuffle`), and then use
+`lookup_columns` to retrieve the training data.
+
+### Distributed Reinforcement Learning
+
+Sophisticated reinforcement learning algorithms are commonly trained across a
+distributed cluster. (See [IMPALA by DeepMind][impala].) One part of the cluster
+runs self-play, while the other part of the cluster learns a new version of the
+model based on the training data generated by self-play. The new model version
+is then distributed to the self-play half of the cluster, and new training data
+is generated to continue the cycle.
+
+In such a configuration, because there is value in training on the freshest
+examples, a storage service like Cloud Bigtable can be used to store and
+serve the generated training data. When using Cloud Bigtable, there is no need
+to aggregate the examples into large batch files, but the examples can instead
+be written as soon as they are generated, and then retrieved at high speed.
+
+[impala]: https://arxiv.org/abs/1802.01561
+
+## Common Gotchas!
+
+### gRPC Certificates
+
+If you encounter a log line that includes the following:
+
+```
+"description":"Failed to load file", [...],
+"filename":"/usr/share/grpc/roots.pem"
+```
+
+you likely need to copy the [gRPC roots.pem file][grpcPem] to
+`/usr/share/grpc/roots.pem` on your local machine.
+
+[grpcPem]: https://github.com/grpc/grpc/blob/master/etc/roots.pem
+
+### Permission denied errors
+
+The TensorFlow Cloud Bigtable client will search for credentials to use in the
+process's environment. It will use the first credentials it finds if multiple
+are available.
+
+ - **Compute Engine**: When running on Compute Engine, the client will often use
+   the service account from the virtual machine's metadata service. Be sure to
+   authorize your Compute Engine VM to have access to the Cloud Bigtable service
+   when creating your VM.
+ - **Cloud TPU**: Your Cloud TPUs run with the designated Cloud TPU service
+   account dedicated to your GCP project. Ensure the service account has been
+   authorized via the Cloud Console to access your Cloud Bigtable instances.
diff --git a/tensorflow/contrib/bigtable/__init__.py b/tensorflow/contrib/bigtable/__init__.py
index 7df054637cdab32f2dd6201dd3488a90495e1cf5..b7d89c98420ab3ac1465bba718f8257ce2312467 100644
--- a/tensorflow/contrib/bigtable/__init__.py
+++ b/tensorflow/contrib/bigtable/__init__.py
@@ -18,7 +18,7 @@ This contrib package allows TensorFlow to interface directly with Cloud Bigtable
 for high-speed data loading.
 
 @@BigtableClient
-@@BigTable
+@@BigtableTable
 
 """
 
@@ -26,14 +26,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.bigtable.python.ops.bigtable_api import BigTable
 from tensorflow.contrib.bigtable.python.ops.bigtable_api import BigtableClient
+from tensorflow.contrib.bigtable.python.ops.bigtable_api import BigtableTable
 
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
-    'BigTable',
     'BigtableClient',
+    'BigtableTable',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
index 70923e6287018c5ecdf8849be99da4ffa68d9bd2..a6755a3496f3e1720f1c8c67f75521f2380a9845 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_kernels.cc
@@ -276,7 +276,7 @@ class ToBigtableOp : public AsyncOpKernel {
         }
         OP_REQUIRES_ASYNC(
             ctx, failures.empty() && mutation_status.ok(),
-            errors::Unknown("Failure while writing to BigTable: ",
+            errors::Unknown("Failure while writing to Cloud Bigtable: ",
                             mutation_status.error_code(), " - ",
                             mutation_status.error_message(), " (",
                             mutation_status.error_details(),
diff --git a/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc b/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc
index 2514575f30831bdcfab87eba07511fd309e8b1c2..67bf14c17646cff81af707405b66c9fba2ded0bd 100644
--- a/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc
+++ b/tensorflow/contrib/bigtable/kernels/bigtable_lib.cc
@@ -27,10 +27,10 @@ Status GrpcStatusToTfStatus(const ::grpc::Status& status) {
       status.error_code() == ::grpc::StatusCode::OUT_OF_RANGE) {
     grpc_code = ::grpc::StatusCode::INTERNAL;
   }
-  return Status(
-      static_cast<::tensorflow::error::Code>(status.error_code()),
-      strings::StrCat("Error reading from BigTable: ", status.error_message(),
-                      " (Details: ", status.error_details(), ")"));
+  return Status(static_cast<::tensorflow::error::Code>(status.error_code()),
+                strings::StrCat("Error reading from Cloud Bigtable: ",
+                                status.error_message(),
+                                " (Details: ", status.error_details(), ")"));
 }
 
 string RegexFromStringSet(const std::vector<string>& strs) {
diff --git a/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py b/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py
index 2f2006461926c11ea1c150cf6dd8219e776b7dd1..e36f7f32c61b50047c0d9137427f2a24462b1c9a 100644
--- a/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py
+++ b/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py
@@ -44,7 +44,7 @@ class BigtableOpsTest(test.TestCase):
   def setUp(self):
     self._client = gen_bigtable_test_ops.bigtable_test_client()
     table = gen_bigtable_ops.bigtable_table(self._client, "testtable")
-    self._table = bigtable.BigTable("testtable", None, table)
+    self._table = bigtable.BigtableTable("testtable", None, table)
 
   def _makeSimpleDataset(self):
     output_rows = dataset_ops.Dataset.from_tensor_slices(self.COMMON_ROW_KEYS)
diff --git a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
index 9f73b7223c64a23bd570ef342fe9cf89bdf8832c..fd30aa8bbb962257c1ef5ac07e047fffca88c4bc 100644
--- a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
+++ b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
@@ -94,7 +94,7 @@ class BigtableClient(object):
         project_id, instance_id, connection_pool_size, max_receive_message_size)
 
   def table(self, name, snapshot=None):
-    """Opens a table and returns a `BigTable` object.
+    """Opens a table and returns a `BigtableTable` object.
 
     Args:
       name: A `tf.string` `tf.Tensor` name of the table to open.
@@ -102,19 +102,20 @@ class BigtableClient(object):
         request the creation of a snapshot. (Note: currently unimplemented.)
 
     Returns:
-      A `BigTable` python object representing the operations available on the
-      table.
+      A `BigtableTable` python object representing the operations available on
+      the table.
     """
     # TODO(saeta): Implement snapshot functionality.
     table = gen_bigtable_ops.bigtable_table(self._resource, name)
-    return BigTable(name, snapshot, table)
+    return BigtableTable(name, snapshot, table)
 
 
-class BigTable(object):
-  """BigTable is the entrypoint for reading and writing data in Cloud Bigtable.
+class BigtableTable(object):
+  """BigtableTable is the entrypoint for reading and writing data in Cloud
+  Bigtable.
 
-  This BigTable class is the python representation of the Cloud Bigtable table
-  within TensorFlow. Methods on this class allow data to be read from and
+  This BigtableTable class is the Python representation of the Cloud Bigtable
+  table within TensorFlow. Methods on this class allow data to be read from and
   written to the Cloud Bigtable service in flexible and high performance
   manners.
   """
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
index ef0e80cd0997bc0e95cd0d150e87db144a2dde44..f4a375328eb9cdbe17682637c2f20e3aa8a1e0ca 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -147,6 +147,7 @@ py_library(
     deps = [
         ":distillation_loss",
         ":estimator_utils",
+        ":model",
         ":trainer_hooks",
         "//tensorflow/contrib/boosted_trees:gbdt_batch",
         "//tensorflow/contrib/boosted_trees:model_ops_py",
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
index 7eb429b636a5193a124dd9b0c020dae6cac910cb..dbfa69edcbf9e59fedc068b8ee516b92e2c03f4f 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
@@ -26,6 +26,7 @@ from __future__ import print_function
 import six
 
 from tensorflow.contrib import layers
+from tensorflow.contrib.boosted_trees.estimator_batch import model
 from tensorflow.contrib.boosted_trees.estimator_batch import distillation_loss
 from tensorflow.contrib.boosted_trees.estimator_batch import estimator_utils
 from tensorflow.contrib.boosted_trees.estimator_batch import trainer_hooks
@@ -34,6 +35,7 @@ from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batc
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
+from tensorflow.python.estimator import estimator as core_estimator
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
 from tensorflow.python.feature_column import feature_column as feature_column_lib
 from tensorflow.python.framework import ops
@@ -62,27 +64,29 @@ def _add_hidden_layer_summary(value, tag):
   summary.histogram("%s_activation" % tag, value)
 
 
-def _dnn_tree_combined_model_fn(features,
-                                labels,
-                                mode,
-                                head,
-                                dnn_hidden_units,
-                                dnn_feature_columns,
-                                tree_learner_config,
-                                num_trees,
-                                tree_examples_per_layer,
-                                config=None,
-                                dnn_optimizer="Adagrad",
-                                dnn_activation_fn=nn.relu,
-                                dnn_dropout=None,
-                                dnn_input_layer_partitioner=None,
-                                dnn_input_layer_to_tree=True,
-                                dnn_steps_to_train=10000,
-                                predict_with_tree_only=False,
-                                tree_feature_columns=None,
-                                tree_center_bias=False,
-                                dnn_to_tree_distillation_param=None,
-                                use_core_versions=False):
+def _dnn_tree_combined_model_fn(
+    features,
+    labels,
+    mode,
+    head,
+    dnn_hidden_units,
+    dnn_feature_columns,
+    tree_learner_config,
+    num_trees,
+    tree_examples_per_layer,
+    config=None,
+    dnn_optimizer="Adagrad",
+    dnn_activation_fn=nn.relu,
+    dnn_dropout=None,
+    dnn_input_layer_partitioner=None,
+    dnn_input_layer_to_tree=True,
+    dnn_steps_to_train=10000,
+    predict_with_tree_only=False,
+    tree_feature_columns=None,
+    tree_center_bias=False,
+    dnn_to_tree_distillation_param=None,
+    use_core_versions=False,
+    output_type=model.ModelBuilderOutputType.MODEL_FN_OPS):
   """DNN and GBDT combined model_fn.
 
   Args:
@@ -156,6 +160,10 @@ def _dnn_tree_combined_model_fn(features,
       partitioned_variables.min_max_variable_partitioner(
           max_partitions=config.num_ps_replicas, min_slice_size=64 << 20))
 
+  if (output_type == model.ModelBuilderOutputType.ESTIMATOR_SPEC and
+      not use_core_versions):
+    raise ValueError("You must use core versions with Estimator Spec")
+
   with variable_scope.variable_scope(
       dnn_parent_scope,
       values=tuple(six.itervalues(features)),
@@ -235,7 +243,8 @@ def _dnn_tree_combined_model_fn(features,
       learner_config=tree_learner_config,
       feature_columns=tree_feature_columns,
       logits_dimension=head.logits_dimension,
-      features=tree_features)
+      features=tree_features,
+      use_core_columns=use_core_versions)
 
   with ops.name_scope("gbdt"):
     predictions_dict = gbdt_model.predict(mode)
@@ -284,63 +293,96 @@ def _dnn_tree_combined_model_fn(features,
     del loss
     return control_flow_ops.no_op()
 
-  if use_core_versions:
-    model_fn_ops = head.create_estimator_spec(
-        features=features,
-        mode=mode,
-        labels=labels,
-        train_op_fn=_no_train_op_fn,
-        logits=tree_train_logits)
-    dnn_train_op = head.create_estimator_spec(
-        features=features,
-        mode=mode,
-        labels=labels,
-        train_op_fn=_dnn_train_op_fn,
-        logits=dnn_logits)
-    dnn_train_op = estimator_utils.estimator_spec_to_model_fn_ops(
-        dnn_train_op).train_op
+  if tree_center_bias:
+    num_trees += 1
+  finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor()
 
-    tree_train_op = head.create_estimator_spec(
-        features=tree_features,
-        mode=mode,
-        labels=labels,
-        train_op_fn=_tree_train_op_fn,
-        logits=tree_train_logits)
-    tree_train_op = estimator_utils.estimator_spec_to_model_fn_ops(
-        tree_train_op).train_op
+  if output_type == model.ModelBuilderOutputType.MODEL_FN_OPS:
+    if use_core_versions:
+      model_fn_ops = head.create_estimator_spec(
+          features=features,
+          mode=mode,
+          labels=labels,
+          train_op_fn=_no_train_op_fn,
+          logits=tree_train_logits)
+      dnn_train_op = head.create_estimator_spec(
+          features=features,
+          mode=mode,
+          labels=labels,
+          train_op_fn=_dnn_train_op_fn,
+          logits=dnn_logits)
+      dnn_train_op = estimator_utils.estimator_spec_to_model_fn_ops(
+          dnn_train_op).train_op
 
-    model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(model_fn_ops)
-  else:
-    model_fn_ops = head.create_model_fn_ops(
+      tree_train_op = head.create_estimator_spec(
+          features=tree_features,
+          mode=mode,
+          labels=labels,
+          train_op_fn=_tree_train_op_fn,
+          logits=tree_train_logits)
+      tree_train_op = estimator_utils.estimator_spec_to_model_fn_ops(
+          tree_train_op).train_op
+
+      model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(
+          model_fn_ops)
+    else:
+      model_fn_ops = head.create_model_fn_ops(
+          features=features,
+          mode=mode,
+          labels=labels,
+          train_op_fn=_no_train_op_fn,
+          logits=tree_train_logits)
+      dnn_train_op = head.create_model_fn_ops(
+          features=features,
+          mode=mode,
+          labels=labels,
+          train_op_fn=_dnn_train_op_fn,
+          logits=dnn_logits).train_op
+      tree_train_op = head.create_model_fn_ops(
+          features=tree_features,
+          mode=mode,
+          labels=labels,
+          train_op_fn=_tree_train_op_fn,
+          logits=tree_train_logits).train_op
+
+    # Add the hooks
+    model_fn_ops.training_hooks.extend([
+        trainer_hooks.SwitchTrainOp(dnn_train_op, dnn_steps_to_train,
+                                    tree_train_op),
+        trainer_hooks.StopAfterNTrees(num_trees, attempted_trees,
+                                      finalized_trees)
+    ])
+    return model_fn_ops
+
+  elif output_type == model.ModelBuilderOutputType.ESTIMATOR_SPEC:
+    fusion_spec = head.create_estimator_spec(
         features=features,
         mode=mode,
         labels=labels,
         train_op_fn=_no_train_op_fn,
         logits=tree_train_logits)
-    dnn_train_op = head.create_model_fn_ops(
+    dnn_spec = head.create_estimator_spec(
         features=features,
         mode=mode,
         labels=labels,
         train_op_fn=_dnn_train_op_fn,
-        logits=dnn_logits).train_op
-    tree_train_op = head.create_model_fn_ops(
+        logits=dnn_logits)
+    tree_spec = head.create_estimator_spec(
         features=tree_features,
         mode=mode,
         labels=labels,
         train_op_fn=_tree_train_op_fn,
-        logits=tree_train_logits).train_op
-
-  if tree_center_bias:
-    num_trees += 1
-  finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor()
-
-  model_fn_ops.training_hooks.extend([
-      trainer_hooks.SwitchTrainOp(dnn_train_op, dnn_steps_to_train,
-                                  tree_train_op),
-      trainer_hooks.StopAfterNTrees(num_trees, attempted_trees, finalized_trees)
-  ])
+        logits=tree_train_logits)
 
-  return model_fn_ops
+    training_hooks = [
+        trainer_hooks.SwitchTrainOp(dnn_spec.train_op, dnn_steps_to_train,
+                                    tree_spec.train_op),
+        trainer_hooks.StopAfterNTrees(num_trees, attempted_trees,
+                                      finalized_trees)
+    ]
+    fusion_spec = fusion_spec._replace(training_hooks=training_hooks +
+                                       list(fusion_spec.training_hooks))
+    return fusion_spec
 
 
 class DNNBoostedTreeCombinedClassifier(estimator.Estimator):
@@ -697,3 +739,100 @@ class DNNBoostedTreeCombinedEstimator(estimator.Estimator):
         model_dir=model_dir,
         config=config,
         feature_engineering_fn=feature_engineering_fn)
+
+
+class CoreDNNBoostedTreeCombinedEstimator(core_estimator.Estimator):
+  """Initializes a core version of DNNBoostedTreeCombinedEstimator.
+
+    Args:
+      dnn_hidden_units: List of hidden units per layer for DNN.
+      dnn_feature_columns: An iterable containing all the feature columns
+        used by the model's DNN.
+      tree_learner_config: A config for the tree learner.
+      num_trees: Number of trees to grow model to after training DNN.
+      tree_examples_per_layer: Number of examples to accumulate before
+        growing the tree a layer. This value has a big impact on model
+        quality and should be set equal to the number of examples in
+        training dataset if possible. It can also be a function that computes
+        the number of examples based on the depth of the layer that's
+        being built.
+      head: `Head` instance.
+      model_dir: Directory for model exports.
+      config: `RunConfig` of the estimator.
+      dnn_optimizer: string, `Optimizer` object, or callable that defines the
+        optimizer to use for training the DNN. If `None`, will use the Adagrad
+        optimizer with default learning rate.
+      dnn_activation_fn: Activation function applied to each layer of the DNN.
+        If `None`, will use `tf.nn.relu`.
+      dnn_dropout: When not `None`, the probability to drop out a given
+        unit in the DNN.
+      dnn_input_layer_partitioner: Partitioner for input layer of the DNN.
+        Defaults to `min_max_variable_partitioner` with `min_slice_size`
+        64 << 20.
+      dnn_input_layer_to_tree: Whether to provide the DNN's input layer
+      as a feature to the tree.
+      dnn_steps_to_train: Number of steps to train dnn for before switching
+        to gbdt.
+      predict_with_tree_only: Whether to use only the tree model output as the
+        final prediction.
+      tree_feature_columns: An iterable containing all the feature columns
+        used by the model's boosted trees. If dnn_input_layer_to_tree is
+        set to True, these features are in addition to dnn_feature_columns.
+      tree_center_bias: Whether a separate tree should be created for
+        first fitting the bias.
+      dnn_to_tree_distillation_param: A Tuple of (float, loss_fn), where the
+        float defines the weight of the distillation loss, and the loss_fn, for
+        computing distillation loss, takes dnn_logits, tree_logits and weight
+        tensor. If the entire tuple is None, no distillation will be applied. If
+        only the loss_fn is None, we will take the sigmoid/softmax cross entropy
+        loss be default. When distillation is applied, `predict_with_tree_only`
+        will be set to True.
+    """
+
+  def __init__(self,
+               dnn_hidden_units,
+               dnn_feature_columns,
+               tree_learner_config,
+               num_trees,
+               tree_examples_per_layer,
+               head,
+               model_dir=None,
+               config=None,
+               dnn_optimizer="Adagrad",
+               dnn_activation_fn=nn.relu,
+               dnn_dropout=None,
+               dnn_input_layer_partitioner=None,
+               dnn_input_layer_to_tree=True,
+               dnn_steps_to_train=10000,
+               predict_with_tree_only=False,
+               tree_feature_columns=None,
+               tree_center_bias=False,
+               dnn_to_tree_distillation_param=None):
+
+    def _model_fn(features, labels, mode, config):
+      return _dnn_tree_combined_model_fn(
+          features=features,
+          labels=labels,
+          mode=mode,
+          head=head,
+          dnn_hidden_units=dnn_hidden_units,
+          dnn_feature_columns=dnn_feature_columns,
+          tree_learner_config=tree_learner_config,
+          num_trees=num_trees,
+          tree_examples_per_layer=tree_examples_per_layer,
+          config=config,
+          dnn_optimizer=dnn_optimizer,
+          dnn_activation_fn=dnn_activation_fn,
+          dnn_dropout=dnn_dropout,
+          dnn_input_layer_partitioner=dnn_input_layer_partitioner,
+          dnn_input_layer_to_tree=dnn_input_layer_to_tree,
+          dnn_steps_to_train=dnn_steps_to_train,
+          predict_with_tree_only=predict_with_tree_only,
+          tree_feature_columns=tree_feature_columns,
+          tree_center_bias=tree_center_bias,
+          dnn_to_tree_distillation_param=dnn_to_tree_distillation_param,
+          output_type=model.ModelBuilderOutputType.ESTIMATOR_SPEC,
+          use_core_versions=True)
+
+    super(CoreDNNBoostedTreeCombinedEstimator, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
index 9b7acfa664b0398216b5a7fb904960d8363929d6..839eedd3a87ccaa1faecd1966fe5907d682cac02 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator_test.py
@@ -28,10 +28,11 @@ from tensorflow.python.estimator.canned import head as head_lib
 from tensorflow.python.feature_column import feature_column_lib as core_feature_column
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import googletest
-
+from tensorflow.python.training import checkpoint_utils
 
 def _train_input_fn():
   features = {
@@ -156,5 +157,72 @@ class DNNBoostedTreeCombinedTest(test_util.TensorFlowTestCase):
     classifier.evaluate(input_fn=_eval_input_fn, steps=1)
 
 
+class CoreDNNBoostedTreeCombinedTest(test_util.TensorFlowTestCase):
+
+  def _assert_checkpoint(self, model_dir, global_step):
+    reader = checkpoint_utils.load_checkpoint(model_dir)
+    self.assertEqual(global_step, reader.get_tensor(ops.GraphKeys.GLOBAL_STEP))
+
+  def testTrainEvaluateInferDoesNotThrowErrorWithNoDnnInput(self):
+    head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 3
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    est = estimator.CoreDNNBoostedTreeCombinedEstimator(
+        head=head_fn,
+        dnn_hidden_units=[1],
+        dnn_feature_columns=[core_feature_column.numeric_column("x")],
+        tree_learner_config=learner_config,
+        num_trees=1,
+        tree_examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        dnn_steps_to_train=10,
+        dnn_input_layer_to_tree=False,
+        tree_feature_columns=[core_feature_column.numeric_column("x")])
+
+    # Train for a few steps.
+    est.train(input_fn=_train_input_fn, steps=1000)
+    # 10 steps for dnn, 3  for 1 tree of depth 3 + 1 after the tree finished
+    self._assert_checkpoint(est.model_dir, global_step=14)
+    res = est.evaluate(input_fn=_eval_input_fn, steps=1)
+    self.assertLess(0.5, res["auc"])
+    est.predict(input_fn=_eval_input_fn)
+
+  def testTrainEvaluateInferDoesNotThrowErrorWithDnnInput(self):
+    head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 3
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    est = estimator.CoreDNNBoostedTreeCombinedEstimator(
+        head=head_fn,
+        dnn_hidden_units=[1],
+        dnn_feature_columns=[core_feature_column.numeric_column("x")],
+        tree_learner_config=learner_config,
+        num_trees=1,
+        tree_examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        dnn_steps_to_train=10,
+        dnn_input_layer_to_tree=True,
+        tree_feature_columns=[])
+
+    # Train for a few steps.
+    est.train(input_fn=_train_input_fn, steps=1000)
+    res = est.evaluate(input_fn=_eval_input_fn, steps=1)
+    self.assertLess(0.5, res["auc"])
+    est.predict(input_fn=_eval_input_fn)
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
index 59a78515c6c1fc98c879e590491433695bdd3445..2df879f924d735c5bcd0d354159c825dee3afda8 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
@@ -22,9 +22,16 @@ from tensorflow.contrib.boosted_trees.estimator_batch import model
 from tensorflow.contrib.boosted_trees.python.utils import losses
 from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
+from tensorflow.python.estimator import estimator as core_estimator
 from tensorflow.python.ops import math_ops
 
 
+# ================== Old estimator interface===================================
+# The estimators below were designed for old feature columns and old estimator
+# interface. They can be used with new feature columns and losses by setting
+# use_core_libs = True.
+
+
 class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
   """An estimator using gradient boosted decision trees."""
 
@@ -354,3 +361,166 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator):
         model_dir=model_dir,
         config=config,
         feature_engineering_fn=feature_engineering_fn)
+
+# ================== New Estimator interface===================================
+# The estimators below use new core Estimator interface and must be used with
+# new feature columns and heads.
+
+
+class CoreGradientBoostedDecisionTreeEstimator(core_estimator.Estimator):
+  """An estimator using gradient boosted decision trees.
+
+  Useful for training with user specified `Head`.
+  """
+
+  def __init__(self,
+               learner_config,
+               examples_per_layer,
+               head,
+               num_trees=None,
+               feature_columns=None,
+               weight_column_name=None,
+               model_dir=None,
+               config=None,
+               label_keys=None,
+               feature_engineering_fn=None,
+               logits_modifier_function=None,
+               center_bias=True,
+               output_leaf_index=False):
+    """Initializes a core version of GradientBoostedDecisionTreeEstimator.
+
+    Args:
+      learner_config: A config for the learner.
+      examples_per_layer: Number of examples to accumulate before growing a
+        layer. It can also be a function that computes the number of examples
+        based on the depth of the layer that's being built.
+      head: `Head` instance.
+      num_trees: An int, number of trees to build.
+      feature_columns: A list of feature columns.
+      weight_column_name: Name of the column for weights, or None if not
+        weighted.
+      model_dir: Directory for model exports, etc.
+      config: `RunConfig` object to configure the runtime settings.
+      label_keys: Optional list of strings with size `[n_classes]` defining the
+        label vocabulary. Only supported for `n_classes` > 2.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      logits_modifier_function: A modifier function for the logits.
+      center_bias: Whether a separate tree should be created for first fitting
+        the bias.
+      output_leaf_index: whether to output leaf indices along with predictions
+        during inference. The leaf node indexes are available in predictions
+        dict by the key 'leaf_index'. For example,
+        result_dict = classifier.predict(...)
+        for example_prediction_result in result_dict:
+          # access leaf index list by example_prediction_result["leaf_index"]
+          # which contains one leaf index per tree
+    """
+
+    def _model_fn(features, labels, mode, config):
+      return model.model_builder(
+          features=features,
+          labels=labels,
+          mode=mode,
+          config=config,
+          params={
+              'head': head,
+              'feature_columns': feature_columns,
+              'learner_config': learner_config,
+              'num_trees': num_trees,
+              'weight_column_name': weight_column_name,
+              'examples_per_layer': examples_per_layer,
+              'center_bias': center_bias,
+              'logits_modifier_function': logits_modifier_function,
+              'use_core_libs': True,
+              'output_leaf_index': output_leaf_index,
+          },
+          output_type=model.ModelBuilderOutputType.ESTIMATOR_SPEC)
+
+    super(CoreGradientBoostedDecisionTreeEstimator, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
+
+
+class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator):
+  """A ranking estimator using gradient boosted decision trees."""
+
+  def __init__(
+      self,
+      learner_config,
+      examples_per_layer,
+      head,
+      ranking_model_pair_keys,
+      num_trees=None,
+      feature_columns=None,
+      weight_column_name=None,
+      model_dir=None,
+      config=None,
+      label_keys=None,
+      logits_modifier_function=None,
+      center_bias=False,
+      output_leaf_index=False,
+  ):
+    """Initializes a GradientBoostedDecisionTreeRanker instance.
+
+    This is an estimator that can be trained off the pairwise data and can be
+    used for inference on non-paired data. This is essentially LambdaMart.
+    Args:
+      learner_config: A config for the learner.
+      examples_per_layer: Number of examples to accumulate before growing a
+        layer. It can also be a function that computes the number of examples
+        based on the depth of the layer that's being built.
+      head: `Head` instance.
+      ranking_model_pair_keys: Keys to distinguish between features
+        for left and right part of the training pairs for ranking. For example,
+        for an Example with features "a.f1" and "b.f1", the keys would be
+        ("a", "b").
+      num_trees: An int, number of trees to build.
+      feature_columns: A list of feature columns.
+      weight_column_name: Name of the column for weights, or None if not
+        weighted.
+      model_dir: Directory for model exports, etc.
+      config: `RunConfig` object to configure the runtime settings.
+      label_keys: Optional list of strings with size `[n_classes]` defining the
+        label vocabulary. Only supported for `n_classes` > 2.
+      logits_modifier_function: A modifier function for the logits.
+      center_bias: Whether a separate tree should be created for first fitting
+        the bias.
+      output_leaf_index: whether to output leaf indices along with predictions
+        during inference. The leaf node indexes are available in predictions
+        dict by the key 'leaf_index'. It is a Tensor of rank 2 and its shape is
+        [batch_size, num_trees].
+        For example,
+        result_iter = classifier.predict(...)
+        for result_dict in result_iter:
+          # access leaf index list by result_dict["leaf_index"]
+          # which contains one leaf index per tree
+
+    Raises:
+      ValueError: If learner_config is not valid.
+    """
+
+    def _model_fn(features, labels, mode, config):
+      return model.ranking_model_builder(
+          features=features,
+          labels=labels,
+          mode=mode,
+          config=config,
+          params={
+              'head': head,
+              'n_classes': 2,
+              'feature_columns': feature_columns,
+              'learner_config': learner_config,
+              'num_trees': num_trees,
+              'weight_column_name': weight_column_name,
+              'examples_per_layer': examples_per_layer,
+              'center_bias': center_bias,
+              'logits_modifier_function': logits_modifier_function,
+              'use_core_libs': True,
+              'output_leaf_index': output_leaf_index,
+              'ranking_model_pair_keys': ranking_model_pair_keys,
+          },
+          output_type=model.ModelBuilderOutputType.ESTIMATOR_SPEC)
+
+    super(CoreGradientBoostedDecisionTreeRanker, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
index 2c2dcb039d98c4793996800e73d7bb9c4d6e6b89..9e9febbbef662a594d3589b501e9ae0eea0af196 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -182,7 +182,7 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     config = run_config.RunConfig()
 
     head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
-        loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
+        loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
 
     model = estimator.GradientBoostedDecisionTreeRanker(
         head=head_fn,
@@ -203,5 +203,60 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     model.predict(input_fn=_infer_ranking_train_input_fn)
 
 
+class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
+
+  def testTrainEvaluateInferDoesNotThrowError(self):
+    head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    est = estimator.CoreGradientBoostedDecisionTreeEstimator(
+        head=head_fn,
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=[core_feature_column.numeric_column("x")])
+
+    # Train for a few steps.
+    est.train(input_fn=_train_input_fn, steps=1000)
+    est.evaluate(input_fn=_eval_input_fn, steps=1)
+    est.predict(input_fn=_eval_input_fn)
+
+  def testRankingDontThrowExceptionForForEstimator(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    head_fn = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
+        loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS)
+
+    est = estimator.CoreGradientBoostedDecisionTreeRanker(
+        head=head_fn,
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=[
+            core_feature_column.numeric_column("f1"),
+            core_feature_column.numeric_column("f2")
+        ],
+        ranking_model_pair_keys=("a", "b"))
+
+    # Train for a few steps.
+    est.train(input_fn=_ranking_train_input_fn, steps=1000)
+    est.evaluate(input_fn=_ranking_train_input_fn, steps=1)
+    est.predict(input_fn=_infer_ranking_train_input_fn)
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/model.py b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
index 0e8a56e6e9ec0e4b6f8e3cebd15d72fbf68dad32..161cc42cb0fe93c18722923095edf7228b5b378c 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/model.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
@@ -29,7 +29,17 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.training import training_util
 
-def model_builder(features, labels, mode, params, config):
+class ModelBuilderOutputType(object):
+  MODEL_FN_OPS = 0
+  ESTIMATOR_SPEC = 1
+
+
+def model_builder(features,
+                  labels,
+                  mode,
+                  params,
+                  config,
+                  output_type=ModelBuilderOutputType.MODEL_FN_OPS):
   """Multi-machine batch gradient descent tree model.
 
   Args:
@@ -49,6 +59,8 @@ def model_builder(features, labels, mode, params, config):
       * center_bias: Whether a separate tree should be created for first fitting
           the bias.
     config: `RunConfig` of the estimator.
+    output_type: Whether to return ModelFnOps (old interface) or EstimatorSpec
+      (new interface).
 
   Returns:
     A `ModelFnOps` object.
@@ -115,35 +127,63 @@ def model_builder(features, labels, mode, params, config):
         return update_op
 
   create_estimator_spec_op = getattr(head, "create_estimator_spec", None)
-  if use_core_libs and callable(create_estimator_spec_op):
-    model_fn_ops = head.create_estimator_spec(
-        features=features,
-        mode=mode,
-        labels=labels,
-        train_op_fn=_train_op_fn,
-        logits=logits)
-    model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(model_fn_ops)
-  else:
-    model_fn_ops = head.create_model_fn_ops(
-        features=features,
-        mode=mode,
-        labels=labels,
-        train_op_fn=_train_op_fn,
-        logits=logits)
-  if output_leaf_index and gbdt_batch.LEAF_INDEX in predictions_dict:
-    model_fn_ops.predictions[gbdt_batch.LEAF_INDEX] = predictions_dict[
-        gbdt_batch.LEAF_INDEX]
+
+  training_hooks = []
   if num_trees:
     if center_bias:
       num_trees += 1
+
     finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor()
-    model_fn_ops.training_hooks.append(
+    training_hooks.append(
         trainer_hooks.StopAfterNTrees(num_trees, attempted_trees,
                                       finalized_trees))
+
+  if output_type == ModelBuilderOutputType.MODEL_FN_OPS:
+    if use_core_libs and callable(create_estimator_spec_op):
+      model_fn_ops = head.create_estimator_spec(
+          features=features,
+          mode=mode,
+          labels=labels,
+          train_op_fn=_train_op_fn,
+          logits=logits)
+      model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(
+          model_fn_ops)
+    else:
+      model_fn_ops = head.create_model_fn_ops(
+          features=features,
+          mode=mode,
+          labels=labels,
+          train_op_fn=_train_op_fn,
+          logits=logits)
+
+    if output_leaf_index and gbdt_batch.LEAF_INDEX in predictions_dict:
+      model_fn_ops.predictions[gbdt_batch.LEAF_INDEX] = predictions_dict[
+          gbdt_batch.LEAF_INDEX]
+
+    model_fn_ops.training_hooks.extend(training_hooks)
+    return model_fn_ops
+  elif output_type == ModelBuilderOutputType.ESTIMATOR_SPEC:
+    assert callable(create_estimator_spec_op)
+    estimator_spec = head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        logits=logits)
+
+    estimator_spec = estimator_spec._replace(
+        training_hooks=training_hooks + list(estimator_spec.training_hooks))
+    return estimator_spec
+
   return model_fn_ops
 
 
-def ranking_model_builder(features, labels, mode, params, config):
+def ranking_model_builder(features,
+                          labels,
+                          mode,
+                          params,
+                          config,
+                          output_type=ModelBuilderOutputType.MODEL_FN_OPS):
   """Multi-machine batch gradient descent tree model for ranking.
 
   Args:
@@ -167,6 +207,9 @@ def ranking_model_builder(features, labels, mode, params, config):
         for an Example with features "a.f1" and "b.f1", the keys would be
         ("a", "b").
     config: `RunConfig` of the estimator.
+    output_type: Whether to return ModelFnOps (old interface) or EstimatorSpec
+      (new interface).
+
 
   Returns:
     A `ModelFnOps` object.
@@ -294,31 +337,54 @@ def ranking_model_builder(features, labels, mode, params, config):
         return update_op
 
   create_estimator_spec_op = getattr(head, "create_estimator_spec", None)
-  if use_core_libs and callable(create_estimator_spec_op):
-    model_fn_ops = head.create_estimator_spec(
-        features=features,
-        mode=mode,
-        labels=labels,
-        train_op_fn=_train_op_fn,
-        logits=logits)
-    model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(model_fn_ops)
-  else:
-    model_fn_ops = head.create_model_fn_ops(
-        features=features,
-        mode=mode,
-        labels=labels,
-        train_op_fn=_train_op_fn,
-        logits=logits)
 
-  if output_leaf_index and gbdt_batch.LEAF_INDEX in predictions_dict:
-    model_fn_ops.predictions[gbdt_batch.LEAF_INDEX] = predictions_dict[
-        gbdt_batch.LEAF_INDEX]
+  training_hooks = []
   if num_trees:
     if center_bias:
       num_trees += 1
+
     finalized_trees, attempted_trees = (
         gbdt_model_main.get_number_of_trees_tensor())
-    model_fn_ops.training_hooks.append(
+    training_hooks.append(
         trainer_hooks.StopAfterNTrees(num_trees, attempted_trees,
                                       finalized_trees))
+
+  if output_type == ModelBuilderOutputType.MODEL_FN_OPS:
+    if use_core_libs and callable(create_estimator_spec_op):
+      model_fn_ops = head.create_estimator_spec(
+          features=features,
+          mode=mode,
+          labels=labels,
+          train_op_fn=_train_op_fn,
+          logits=logits)
+      model_fn_ops = estimator_utils.estimator_spec_to_model_fn_ops(
+          model_fn_ops)
+    else:
+      model_fn_ops = head.create_model_fn_ops(
+          features=features,
+          mode=mode,
+          labels=labels,
+          train_op_fn=_train_op_fn,
+          logits=logits)
+
+    if output_leaf_index and gbdt_batch.LEAF_INDEX in predictions_dict:
+      model_fn_ops.predictions[gbdt_batch.LEAF_INDEX] = predictions_dict[
+          gbdt_batch.LEAF_INDEX]
+
+    model_fn_ops.training_hooks.extend(training_hooks)
+    return model_fn_ops
+
+  elif output_type == ModelBuilderOutputType.ESTIMATOR_SPEC:
+    assert callable(create_estimator_spec_op)
+    estimator_spec = head.create_estimator_spec(
+        features=features,
+        mode=mode,
+        labels=labels,
+        train_op_fn=_train_op_fn,
+        logits=logits)
+
+    estimator_spec = estimator_spec._replace(
+        training_hooks=training_hooks + list(estimator_spec.training_hooks))
+    return estimator_spec
+
   return model_fn_ops
diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
index 0b28f81e7ca9a1228adc5bde19c429265e0aa9b8..5b4be2f25838d5405a8148ea20cb0f759cd3a8fb 100644
--- a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
@@ -241,6 +241,11 @@ class CreateQuantileAccumulatorOp : public OpKernel {
     // other exceptions. If one already exists, it unrefs the new one.
     const Tensor* stamp_token_t;
     OP_REQUIRES_OK(context, context->input(kStampTokenName, &stamp_token_t));
+    // An epsilon value of zero could cause perfoamance issues and is therefore,
+    // disallowed.
+    OP_REQUIRES(
+        context, epsilon_ > 0,
+        errors::InvalidArgument("An epsilon value of zero is not allowed."));
     auto result = new QuantileStreamResource(epsilon_, num_quantiles_,
                                              max_elements_, generate_quantiles_,
                                              stamp_token_t->scalar<int64>()());
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
index c120dd8a6c156ec9eb7ba0b6c552f5138bd21a16..f19e5116f5865777ab65e1add2777ac41105acc0 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
@@ -58,6 +58,8 @@ namespace quantiles {
 // Compute: O(n * log(1/eps * log(eps * n))).
 // Memory: O(1/eps * log^2(eps * n)) <- for one worker streaming through the
 //                                      entire dataset.
+// An epsilon value of zero would make the algorithm extremely inefficent and
+// therefore, is disallowed.
 template <typename ValueType, typename WeightType,
           typename CompareFn = std::less<ValueType>>
 class WeightedQuantilesStream {
@@ -69,6 +71,9 @@ class WeightedQuantilesStream {
 
   explicit WeightedQuantilesStream(double eps, int64 max_elements)
       : eps_(eps), buffer_(1LL, 2LL), finalized_(false) {
+    // See the class documentation. An epsilon value of zero could cause
+    // perfoamance issues.
+    QCHECK(eps > 0) << "An epsilon value of zero is not allowed.";
     std::tie(max_levels_, block_size_) = GetQuantileSpecs(eps, max_elements);
     buffer_ = Buffer(block_size_, max_elements);
     summary_levels_.reserve(max_levels_);
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index 1ee7f2395ea2ad71a7d380a1cc8f9a77bd4782b3..e08b230f468ea2179aa6a8abf5225e9549ea2ee4 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -287,7 +287,8 @@ class GradientBoostedDecisionTreeModel(object):
                loss_reduction=losses.Reduction.SUM_OVER_NONZERO_WEIGHTS,
                feature_columns=None,
                use_core_columns=False,
-               output_leaf_index=False):
+               output_leaf_index=False,
+               output_leaf_index_modes=None):
     """Construct a new GradientBoostedDecisionTreeModel function.
 
     Args:
@@ -307,6 +308,9 @@ class GradientBoostedDecisionTreeModel(object):
         used.
       output_leaf_index: A boolean variable indicating whether to output leaf
         index into predictions dictionary.
+      output_leaf_index_modes: A list of modes from (TRAIN, EVAL, INFER) which
+        dictates when leaf indices will be outputted. By default, leaf indices
+        are only outputted in INFER mode.
 
     Raises:
       ValueError: if inputs are not valid.
@@ -404,7 +408,16 @@ class GradientBoostedDecisionTreeModel(object):
         self._learner_config.multi_class_strategy ==
         learner_pb2.LearnerConfig.TREE_PER_CLASS and
         learner_config.num_classes == 2)
+
+    if output_leaf_index_modes is None:
+      output_leaf_index_modes = [learn.ModeKeys.INFER]
+    elif not all(
+        mode in (learn.ModeKeys.TRAIN, learn.ModeKeys.EVAL,
+                 learn.ModeKeys.INFER) for mode in output_leaf_index_modes):
+      raise ValueError("output_leaf_index_modes should only contain ModeKeys.")
+
     self._output_leaf_index = output_leaf_index
+    self._output_leaf_index_modes = output_leaf_index_modes
 
   def _predict_and_return_dict(self, ensemble_handle, ensemble_stamp, mode):
     """Runs prediction and returns a dictionary of the prediction results.
@@ -435,8 +448,7 @@ class GradientBoostedDecisionTreeModel(object):
     # the right stamp.
     with ops.control_dependencies(ensemble_stats):
       leaf_index = None
-      # Only used in infer (predict), not used in train and eval.
-      if self._output_leaf_index and mode == learn.ModeKeys.INFER:
+      if self._output_leaf_index and mode in self._output_leaf_index_modes:
         predictions, _, leaf_index = (
             prediction_ops).gradient_trees_prediction_verbose(
                 ensemble_handle,
@@ -508,9 +520,6 @@ class GradientBoostedDecisionTreeModel(object):
     if not input_deps:
       raise ValueError("No input tensors for prediction.")
 
-    if any(i.device != input_deps[0].device for i in input_deps):
-      raise ValueError("All input tensors should be on the same device.")
-
     # Get most current model stamp.
     ensemble_stamp = model_ops.tree_ensemble_stamp_token(self._ensemble_handle)
 
diff --git a/tensorflow/contrib/cloud/README.md b/tensorflow/contrib/cloud/README.md
index 134ce057f4334096b4fbbec29cc85f0ea42c9f86..a80d8965f3b562cadaff8caad8d40c7b98afa78f 100644
--- a/tensorflow/contrib/cloud/README.md
+++ b/tensorflow/contrib/cloud/README.md
@@ -1,8 +1,8 @@
 # Cloud #
 
-## BigTable ##
+## Cloud Bigtable ##
 
-[Google Cloud BigTable](https://cloud.google.com/bigtable/) is a high
+[Google Cloud Bigtable](https://cloud.google.com/bigtable/) is a high
 performance storage system that can store and serve training data. This contrib
 package contains an experimental integration with TensorFlow.
 
diff --git a/tensorflow/contrib/cloud/__init__.py b/tensorflow/contrib/cloud/__init__.py
index af81106a6848bfd8c91108b56c8150d47c3eb501..8efd259946b7696e66b83a3b0aa451543c107467 100644
--- a/tensorflow/contrib/cloud/__init__.py
+++ b/tensorflow/contrib/cloud/__init__.py
@@ -25,8 +25,8 @@ from tensorflow.contrib.cloud.python.ops.bigquery_reader_ops import *
 from tensorflow.contrib.cloud.python.ops.gcs_config_ops import *
 
 if os.name != 'nt':
-  from tensorflow.contrib.bigtable.python.ops.bigtable_api import BigTable
   from tensorflow.contrib.bigtable.python.ops.bigtable_api import BigtableClient
+  from tensorflow.contrib.bigtable.python.ops.bigtable_api import BigtableTable
 
 del os
 
@@ -34,8 +34,8 @@ from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
     'BigQueryReader',
-    'BigTable',
     'BigtableClient',
+    'BigtableTable',
     'BlockCacheParams',
     'configure_colab_session',
     'configure_gcs',
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index 8f521ffee4d31e090c13bac98290656d6e1d330e..f9dc3effd075d7e0add07aa77039824031976772 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -259,11 +259,11 @@ class TPUClusterResolver(ClusterResolver):
 
       if 'state' in response and response['state'] != 'READY':
         raise RuntimeError('TPU "%s" is not yet ready; state: "%s"' %
-                           (self._tpu, response['state']))
+                           (compat.as_text(self._tpu), response['state']))
 
       if 'health' in response and response['health'] != 'HEALTHY':
-        raise RuntimeError('TPU "%s" is unhealthy: "%s"' % (self._tpu,
-                                                            response['health']))
+        raise RuntimeError('TPU "%s" is unhealthy: "%s"' %
+                           (compat.as_text(self._tpu), response['health']))
 
       if 'networkEndpoints' in response:
         worker_list = [
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 708618dcb044905c467726b210dc3455f1d1072b..f6c928e2be62e7292c6feaa3bb26fd463320158b 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -394,16 +394,20 @@ if (tensorflow_ENABLE_GPU)
 
   # by default we assume compute cabability 3.5 and 5.2. If you change this change it in
   # CUDA_NVCC_FLAGS and cuda_config.h below
-  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_30,code=\"sm_30,compute_30\";-gencode arch=compute_35,code=\"sm_35,compute_35\";-gencode arch=compute_52,code=\"sm_52,compute_52\")
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_37,code=\"sm_37,compute_37\")
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_52,code=\"sm_52,compute_52\")
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_60,code=\"sm_60,compute_60\")
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_61,code=\"sm_61,compute_61\")
+  set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_70,code=\"sm_70,compute_70\")
   set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};--include-path ${PROJECT_BINARY_DIR}/$\{build_configuration\};--expt-relaxed-constexpr)
   set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-ftz=true)  # Flush denormals to zero
   set(CUDA_INCLUDE ${CUDA_TOOLKIT_TARGET_DIR} ${CUDA_TOOLKIT_TARGET_DIR}/extras/CUPTI/include)
   include_directories(${CUDA_INCLUDE})
   if (WIN32)
-    add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.0,3.5,5.2)
+    add_definitions(-DGOOGLE_CUDA=1 -DTF_EXTRA_CUDA_CAPABILITIES=3.7,5.2,6.0,6.1,7.0)
   else (WIN32)
-    # Without these double quotes, cmake in Linux makes it "-DTF_EXTRA_CUDA_CAPABILITIES=3.0, -D3.5, -D5.2" for cc, which incurs build breaks
-    add_definitions(-DGOOGLE_CUDA=1 -D"TF_EXTRA_CUDA_CAPABILITIES=3.0,3.5,5.2")
+    # Without these double quotes, cmake in Linux makes it "-DTF_EXTRA_CUDA_CAPABILITIES=3.7, -D5.2, ..." for cc, which incurs build breaks
+    add_definitions(-DGOOGLE_CUDA=1 -D"TF_EXTRA_CUDA_CAPABILITIES=3.7,5.2,6.0,6.1,7.0")
   endif (WIN32)
 
   if (WIN32)
@@ -452,7 +456,7 @@ if (tensorflow_ENABLE_GPU)
   FILE(WRITE ${tensorflow_source_dir}/third_party/gpus/cuda/cuda_config.h
     "#ifndef CUDA_CUDA_CONFIG_H_\n"
     "#define CUDA_CUDA_CONFIG_H_\n"
-    "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.0\"),CudaVersion(\"3.5\"),CudaVersion(\"5.2\")\n"
+    "#define TF_CUDA_CAPABILITIES CudaVersion(\"3.7\"),CudaVersion(\"5.2\"),CudaVersion(\"6.0\"),CudaVersion(\"6.1\"),CudaVersion(\"7.0\")\n"
     "#define TF_CUDA_VERSION \"64_${short_CUDA_VER}\"\n"
     "#define TF_CUDNN_VERSION \"64_${tensorflow_CUDNN_VERSION}\"\n"
     "#define TF_CUDA_TOOLKIT_PATH \"${CUDA_TOOLKIT_ROOT_DIR}\"\n"
@@ -467,7 +471,6 @@ if (tensorflow_ENABLE_GPU)
     ${CUDA_TOOLKIT_TARGET_DIR}/include/cuComplex.h
     ${CUDA_TOOLKIT_TARGET_DIR}/include/cublas_v2.h
     ${CUDA_TOOLKIT_TARGET_DIR}/include/cusolverDn.h
-    ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda_fp16.h
     ${CUDA_TOOLKIT_TARGET_DIR}/include/device_functions.h
     ${CUDA_TOOLKIT_TARGET_DIR}/include/cufft.h
     ${CUDA_TOOLKIT_TARGET_DIR}/include/curand.h
diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
index 5931c8a27996534cca80797e8b840559c124297c..6c9ab6aeb87fd39b22ab4f28d69b432b15899a13 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
@@ -219,8 +219,10 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
                            op_def)
     #Use Graph's hidden methods to add the op
     to_graph._record_op_seen_by_control_dependencies(new_op)
-    for device_function in reversed(to_graph._device_function_stack):
+    # pylint: disable=protected-access
+    for device_function in to_graph._device_functions_outer_to_inner:
       new_op._set_device(device_function(new_op))
+    # pylint: enable=protected-access
 
     return new_op
 
diff --git a/tensorflow/contrib/crf/__init__.py b/tensorflow/contrib/crf/__init__.py
index 046c509626bc2eb20a65c0b38495ff37c294e0e1..615e62b16f1906dafa22a12cc7275a2335e8df88 100644
--- a/tensorflow/contrib/crf/__init__.py
+++ b/tensorflow/contrib/crf/__init__.py
@@ -20,6 +20,7 @@ See the @{$python/contrib.crf} guide.
 @@crf_decode
 @@crf_log_likelihood
 @@crf_log_norm
+@@crf_multitag_sequence_score
 @@crf_sequence_score
 @@crf_unary_score
 @@CrfDecodeBackwardRnnCell
@@ -36,6 +37,7 @@ from tensorflow.contrib.crf.python.ops.crf import crf_binary_score
 from tensorflow.contrib.crf.python.ops.crf import crf_decode
 from tensorflow.contrib.crf.python.ops.crf import crf_log_likelihood
 from tensorflow.contrib.crf.python.ops.crf import crf_log_norm
+from tensorflow.contrib.crf.python.ops.crf import crf_multitag_sequence_score
 from tensorflow.contrib.crf.python.ops.crf import crf_sequence_score
 from tensorflow.contrib.crf.python.ops.crf import crf_unary_score
 from tensorflow.contrib.crf.python.ops.crf import CrfDecodeBackwardRnnCell
diff --git a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
index 74f2ec22ffaab1654e5cd38169258fb87d307ad4..f56a973f6f80b81697e9f58578e60a2efb90154e 100644
--- a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
+++ b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py
@@ -31,6 +31,15 @@ from tensorflow.python.platform import test
 
 class CrfTest(test.TestCase):
 
+  def calculateSequenceScore(self, inputs, transition_params, tag_indices,
+                             sequence_lengths):
+    expected_unary_score = sum(
+        inputs[i][tag_indices[i]] for i in range(sequence_lengths))
+    expected_binary_score = sum(
+        transition_params[tag_indices[i], tag_indices[i + 1]]
+        for i in range(sequence_lengths - 1))
+    return expected_unary_score + expected_binary_score
+
   def testCrfSequenceScore(self):
     transition_params = np.array(
         [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32)
@@ -60,14 +69,55 @@ class CrfTest(test.TestCase):
             transition_params=constant_op.constant(transition_params))
         sequence_score = array_ops.squeeze(sequence_score, [0])
         tf_sequence_score = sess.run(sequence_score)
-        expected_unary_score = sum(inputs[i][tag_indices[i]]
-                                   for i in range(sequence_lengths))
-        expected_binary_score = sum(
-            transition_params[tag_indices[i], tag_indices[i + 1]]
-            for i in range(sequence_lengths - 1))
-        expected_sequence_score = expected_unary_score + expected_binary_score
+        expected_sequence_score = self.calculateSequenceScore(
+            inputs, transition_params, tag_indices, sequence_lengths)
         self.assertAllClose(tf_sequence_score, expected_sequence_score)
 
+  def testCrfMultiTagSequenceScore(self):
+    transition_params = np.array(
+        [[-3, 5, -2], [3, 4, 1], [1, 2, 1]], dtype=np.float32)
+    # Test both the length-1 and regular cases.
+    sequence_lengths_list = [
+        np.array(3, dtype=np.int32),
+        np.array(1, dtype=np.int32)
+    ]
+    inputs_list = [
+        np.array([[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]],
+                 dtype=np.float32),
+        np.array([[4, 5, -3]],
+                 dtype=np.float32),
+    ]
+    tag_bitmap_list = [
+        np.array(
+            [[True, True, False], [True, False, True], [False, True, True],
+             [True, False, True]],
+            dtype=np.bool),
+        np.array([[True, True, False]], dtype=np.bool)
+    ]
+    for sequence_lengths, inputs, tag_bitmap in zip(
+        sequence_lengths_list, inputs_list, tag_bitmap_list):
+      with self.test_session() as sess:
+        sequence_score = crf.crf_multitag_sequence_score(
+            inputs=array_ops.expand_dims(inputs, 0),
+            tag_bitmap=array_ops.expand_dims(tag_bitmap, 0),
+            sequence_lengths=array_ops.expand_dims(sequence_lengths, 0),
+            transition_params=constant_op.constant(transition_params))
+        sequence_score = array_ops.squeeze(sequence_score, [0])
+        tf_sum_sequence_score = sess.run(sequence_score)
+        all_indices_list = [
+            single_index_bitmap.nonzero()[0]
+            for single_index_bitmap in tag_bitmap[:sequence_lengths]
+        ]
+        expected_sequence_scores = [
+            self.calculateSequenceScore(inputs, transition_params, indices,
+                                        sequence_lengths)
+            for indices in itertools.product(*all_indices_list)
+        ]
+        expected_log_sum_exp_sequence_scores = np.logaddexp.reduce(
+            expected_sequence_scores)
+        self.assertAllClose(tf_sum_sequence_score,
+                            expected_log_sum_exp_sequence_scores)
+
   def testCrfUnaryScore(self):
     inputs = np.array(
         [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32)
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index 2d2cbdc1990ed9d8e58c0032cbc141a52271838f..8a7ff61bc8391efe453ee37019c23bd6ccbdf066 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -67,7 +67,7 @@ __all__ = [
     "crf_sequence_score", "crf_log_norm", "crf_log_likelihood",
     "crf_unary_score", "crf_binary_score", "CrfForwardRnnCell",
     "viterbi_decode", "crf_decode", "CrfDecodeForwardRnnCell",
-    "CrfDecodeBackwardRnnCell"
+    "CrfDecodeBackwardRnnCell", "crf_multitag_sequence_score"
 ]
 
 
@@ -114,6 +114,56 @@ def crf_sequence_score(inputs, tag_indices, sequence_lengths,
       false_fn=_multi_seq_fn)
 
 
+def crf_multitag_sequence_score(inputs, tag_bitmap, sequence_lengths,
+                                transition_params):
+  """Computes the unnormalized score of all tag sequences matching tag_bitmap.
+
+  tag_bitmap enables more than one tag to be considered correct at each time
+  step. This is useful when an observed output at a given time step is
+  consistent with more than one tag, and thus the log likelihood of that
+  observation must take into account all possible consistent tags.
+
+  Using one-hot vectors in tag_bitmap gives results identical to
+  crf_sequence_score.
+
+  Args:
+    inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
+        to use as input to the CRF layer.
+    tag_bitmap: A [batch_size, max_seq_len, num_tags] boolean tensor
+        representing all active tags at each index for which to calculate the
+        unnormalized score.
+    sequence_lengths: A [batch_size] vector of true sequence lengths.
+    transition_params: A [num_tags, num_tags] transition matrix.
+  Returns:
+    sequence_scores: A [batch_size] vector of unnormalized sequence scores.
+  """
+
+  # If max_seq_len is 1, we skip the score calculation and simply gather the
+  # unary potentials of all active tags.
+  def _single_seq_fn():
+    filtered_inputs = array_ops.where(
+        tag_bitmap, inputs,
+        array_ops.fill(array_ops.shape(inputs), float("-inf")))
+    return math_ops.reduce_logsumexp(
+        filtered_inputs, axis=[1, 2], keepdims=False)
+
+  def _multi_seq_fn():
+    # Compute the logsumexp of all scores of sequences matching the given tags.
+    filtered_inputs = array_ops.where(
+        tag_bitmap, inputs,
+        array_ops.fill(array_ops.shape(inputs), float("-inf")))
+    return crf_log_norm(
+        inputs=filtered_inputs,
+        sequence_lengths=sequence_lengths,
+        transition_params=transition_params)
+
+  return utils.smart_cond(
+      pred=math_ops.equal(inputs.shape[1].value or array_ops.shape(inputs)[1],
+                          1),
+      true_fn=_single_seq_fn,
+      false_fn=_multi_seq_fn)
+
+
 def crf_log_norm(inputs, sequence_lengths, transition_params):
   """Computes the normalization for a CRF.
 
diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py
index 675330716b2f53edabb61f3ecb37aaed20c4eb90..7878e46e88b2ea8b0012768342c218baeda80eaa 100644
--- a/tensorflow/contrib/data/__init__.py
+++ b/tensorflow/contrib/data/__init__.py
@@ -52,6 +52,7 @@ See @{$guide/datasets$Importing Data} for an overview.
 @@prefetch_to_device
 @@read_batch_features
 @@rejection_resample
+@@reduce_dataset
 @@sample_from_datasets
 @@scan
 @@shuffle_and_repeat
@@ -77,6 +78,7 @@ from tensorflow.contrib.data.python.ops.counter import Counter
 from tensorflow.contrib.data.python.ops.enumerate_ops import enumerate_dataset
 from tensorflow.contrib.data.python.ops.error_ops import ignore_errors
 from tensorflow.contrib.data.python.ops.get_single_element import get_single_element
+from tensorflow.contrib.data.python.ops.get_single_element import reduce_dataset
 from tensorflow.contrib.data.python.ops.grouping import bucket_by_sequence_length
 from tensorflow.contrib.data.python.ops.grouping import group_by_reducer
 from tensorflow.contrib.data.python.ops.grouping import group_by_window
diff --git a/tensorflow/contrib/data/kernels/prefetching_kernels.cc b/tensorflow/contrib/data/kernels/prefetching_kernels.cc
index b3d464d7165d53cf198072e06214f7d5e982073d..32f03ca68364e40c6fd6769f05d0566f50119240 100644
--- a/tensorflow/contrib/data/kernels/prefetching_kernels.cc
+++ b/tensorflow/contrib/data/kernels/prefetching_kernels.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include <deque>
 
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
+#include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_op_kernel.h"
@@ -23,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
+namespace {
 
 struct BufferElement {
   // The producer sets `status` if getting the input element fails.
@@ -473,4 +475,466 @@ class IteratorGetDeviceOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("IteratorGetDevice").Device(DEVICE_CPU),
                         IteratorGetDeviceOp);
 
+Status VerifyTypesMatch(const DataTypeVector& expected,
+                        const DataTypeVector& received) {
+  if (expected.size() != received.size()) {
+    return errors::InvalidArgument(
+        "Number of components does not match: expected ", expected.size(),
+        " types but got ", received.size(), ".");
+  }
+  for (size_t i = 0; i < expected.size(); ++i) {
+    if (expected[i] != received[i]) {
+      return errors::InvalidArgument("Data type mismatch at component ", i,
+                                     ": expected ", DataTypeString(expected[i]),
+                                     " but got ", DataTypeString(received[i]),
+                                     ".");
+    }
+  }
+  return Status::OK();
+}
+
+Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected,
+                              const std::vector<PartialTensorShape>& received) {
+  if (expected.size() != received.size()) {
+    return errors::InvalidArgument(
+        "Number of components does not match: expected ", expected.size(),
+        " shapes but got ", received.size(), ".");
+  }
+  for (size_t i = 0; i < expected.size(); ++i) {
+    if (!expected[i].IsCompatibleWith(received[i])) {
+      return errors::InvalidArgument("Incompatible shapes at component ", i,
+                                     ": expected ", expected[i].DebugString(),
+                                     " but got ", received[i].DebugString(),
+                                     ".");
+    }
+  }
+
+  return Status::OK();
+}
+
+string SanitizeThreadSuffix(string suffix) {
+  string clean;
+  for (int i = 0; i < suffix.size(); ++i) {
+    const char ch = suffix[i];
+    if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
+        (ch >= '0' && ch <= '9') || ch == '_' || ch == '-') {
+      clean += ch;
+    } else {
+      clean += '_';
+    }
+  }
+  return clean;
+}
+
+class MultiDeviceIterator : public ResourceBase {
+ public:
+  MultiDeviceIterator(const DataTypeVector& output_types,
+                      const std::vector<PartialTensorShape>& output_shapes,
+                      const std::vector<string>& devices,
+                      std::unique_ptr<FunctionLibraryDefinition> flib_def,
+                      std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
+                      FunctionLibraryRuntime* lib)
+      : output_types_(output_types),
+        output_shapes_(output_shapes),
+        devices_(devices),
+        flib_def_(std::move(flib_def)),
+        pflr_(std::move(pflr)),
+        lib_(lib) {
+    buffer_.resize(devices_.size());
+  }
+
+  string DebugString() override {
+    return strings::StrCat("MultiDeviceIterator");
+  }
+
+  Status Init(std::unique_ptr<IteratorBase> iterator, int64* incarnation_id) {
+    mutex_lock l(mu_);
+    if (iterator) {
+      TF_RETURN_IF_ERROR(
+          VerifyTypesMatch(output_types_, iterator->output_dtypes()));
+      TF_RETURN_IF_ERROR(
+          VerifyShapesCompatible(output_shapes_, iterator->output_shapes()));
+    }
+    host_iterator_.reset(iterator.release());
+    incarnation_id_++;
+    *incarnation_id = incarnation_id_;
+    max_buffer_size_ = 0;
+    num_elements_ = 0;
+    buffer_.clear();
+    buffer_.resize(devices_.size());
+    return Status::OK();
+  }
+
+  Status GetNextFromShard(IteratorContext* ctx, int shard_num,
+                          int64 incarnation_id,
+                          std::vector<Tensor>* out_tensors,
+                          bool* end_of_sequence) {
+    // TODO(rohanj): This might potentially strand elements in other shards.
+    // Opportunity to do smarter locking semantics.
+    mutex_lock l(mu_);
+    // Make sure we're in the right incarnation.
+    if (incarnation_id != incarnation_id_) {
+      return errors::InvalidArgument(
+          "Current incarnation: ", incarnation_id_,
+          "; Supplied incarnation: ", incarnation_id);
+    }
+    // Then look it up in the buffer.
+    if (!buffer_[shard_num].empty()) {
+      const HostBufferElement& elem = buffer_[shard_num].front();
+      *out_tensors = elem.value;
+      *end_of_sequence = elem.end_of_sequence;
+      Status s = elem.status;
+      buffer_[shard_num].pop_front();
+      return s;
+    }
+    std::shared_ptr<IteratorBase> captured_iterator(host_iterator_);
+    if (captured_iterator) {
+      if (lib_ != nullptr) {
+        ctx->set_lib(lib_);
+      }
+      while (true) {
+        HostBufferElement elem;
+        elem.status =
+            captured_iterator->GetNext(ctx, &elem.value, &elem.end_of_sequence);
+        int buffer_index = num_elements_ % devices_.size();
+        num_elements_++;
+        if (buffer_index == shard_num) {
+          out_tensors->swap(elem.value);
+          *end_of_sequence = elem.end_of_sequence;
+          return elem.status;
+        } else {
+          buffer_[buffer_index].push_back(std::move(elem));
+          // TODO(rohanj): Put an upper bound to buffer size.
+          if (buffer_[buffer_index].size() > max_buffer_size_) {
+            max_buffer_size_ = buffer_[buffer_index].size();
+            VLOG(1) << "MultiDeviceIterator: Max buffer size increased to: "
+                    << max_buffer_size_;
+          }
+        }
+      }
+    } else {
+      return errors::FailedPrecondition("Iterator not initialized");
+    }
+    return Status::OK();
+  }
+
+  const DataTypeVector& output_types() const { return output_types_; }
+
+  const std::vector<PartialTensorShape>& output_shapes() const {
+    return output_shapes_;
+  }
+
+  std::shared_ptr<const FunctionLibraryDefinition> function_library() {
+    tf_shared_lock l(mu_);
+    return lib_def_;
+  }
+
+ private:
+  struct HostBufferElement {
+    Status status;
+    bool end_of_sequence;
+    std::vector<Tensor> value;
+  };
+
+  mutex mu_;
+  const DataTypeVector output_types_;
+  const std::vector<PartialTensorShape> output_shapes_;
+  const std::vector<string> devices_;
+  int64 num_elements_ GUARDED_BY(mu_) = 0;
+  int64 max_buffer_size_ GUARDED_BY(mu_) = 0;
+  int64 incarnation_id_ GUARDED_BY(mu_) = 0;
+  std::vector<std::deque<HostBufferElement>> buffer_ GUARDED_BY(mu_);
+  std::unique_ptr<FunctionLibraryDefinition> flib_def_;
+  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
+  FunctionLibraryRuntime* lib_ = nullptr;  // not owned.
+  std::shared_ptr<IteratorBase> host_iterator_;
+  std::shared_ptr<const FunctionLibraryDefinition> lib_def_ GUARDED_BY(mu_);
+};
+
+// Just creates a MultiDeviceIterator and returns it.
+class MultiDeviceIteratorHandleOp : public OpKernel {
+ public:
+  explicit MultiDeviceIteratorHandleOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), graph_def_version_(ctx->graph_def_version()) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("container", &container_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("devices", &devices_));
+  }
+
+  // The resource is deleted from the resource manager only when it is private
+  // to kernel.
+  ~MultiDeviceIteratorHandleOp() override {
+    if (resource_ != nullptr) {
+      resource_->Unref();
+      if (cinfo_.resource_is_private_to_kernel()) {
+        if (!cinfo_.resource_manager()
+                 ->template Delete<MultiDeviceIterator>(cinfo_.container(),
+                                                        cinfo_.name())
+                 .ok()) {
+          // Do nothing; the resource can have been deleted by session resets.
+        }
+      }
+    }
+  }
+
+  void Compute(OpKernelContext* context) override LOCKS_EXCLUDED(mu_) {
+    {
+      mutex_lock l(mu_);
+      if (resource_ == nullptr) {
+        FunctionLibraryRuntime* lib;
+        std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
+        std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
+        OP_REQUIRES_OK(context, context->function_library()->Clone(
+                                    &flib_def, &pflr, &lib));
+        ResourceMgr* mgr = context->resource_manager();
+        OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
+
+        MultiDeviceIterator* resource;
+        OP_REQUIRES_OK(
+            context,
+            mgr->LookupOrCreate<MultiDeviceIterator>(
+                cinfo_.container(), cinfo_.name(), &resource,
+                [this, lib, &flib_def, &pflr](MultiDeviceIterator** ret)
+                    EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                      *ret = new MultiDeviceIterator(
+                          output_types_, output_shapes_, devices_,
+                          std::move(flib_def), std::move(pflr), lib);
+                      return Status::OK();
+                    }));
+
+        Status s = VerifyResource(resource);
+        if (TF_PREDICT_FALSE(!s.ok())) {
+          resource->Unref();
+          context->SetStatus(s);
+          return;
+        }
+
+        resource_ = resource;
+      }
+    }
+    OP_REQUIRES_OK(context, MakeResourceHandleToOutput(
+                                context, 0, cinfo_.container(), cinfo_.name(),
+                                MakeTypeIndex<MultiDeviceIterator>()));
+  }
+
+ private:
+  // During the first Compute(), resource is either created or looked up using
+  // shared_name. In the latter case, the resource found should be verified if
+  // it is compatible with this op's configuration. The verification may fail in
+  // cases such as two graphs asking queues of the same shared name to have
+  // inconsistent capacities.
+  Status VerifyResource(MultiDeviceIterator* resource) {
+    TF_RETURN_IF_ERROR(
+        VerifyTypesMatch(output_types_, resource->output_types()));
+    TF_RETURN_IF_ERROR(
+        VerifyShapesCompatible(output_shapes_, resource->output_shapes()));
+    return Status::OK();
+  }
+
+  mutex mu_;
+  ContainerInfo cinfo_;  // Written once under mu_ then constant afterwards.
+  MultiDeviceIterator* resource_ GUARDED_BY(mu_) = nullptr;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  const int graph_def_version_;
+  string name_;
+  string container_;
+  std::vector<string> devices_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("MultiDeviceIterator").Device(DEVICE_CPU),
+                        MultiDeviceIteratorHandleOp);
+
+// Calls init on the MultiDeviceIterator.
+class MultiDeviceIteratorInitOp : public OpKernel {
+ public:
+  explicit MultiDeviceIteratorInitOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    DatasetBase* dataset;
+    OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset));
+    MultiDeviceIterator* resource;
+    OP_REQUIRES_OK(ctx,
+                   LookupResource(ctx, HandleFromInput(ctx, 1), &resource));
+    core::ScopedUnref unref(resource);
+
+    IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
+    std::unique_ptr<IteratorBase> iterator;
+    OP_REQUIRES_OK(ctx,
+                   dataset->MakeIterator(&iter_ctx, "Iterator", &iterator));
+    int64 incarnation_id;
+    OP_REQUIRES_OK(ctx, resource->Init(std::move(iterator), &incarnation_id));
+    Tensor tensor_incarnation_id(DT_INT64, TensorShape({}));
+    tensor_incarnation_id.scalar<int64>()() = incarnation_id;
+    OP_REQUIRES_OK(ctx,
+                   ctx->set_output("incarnation_id", tensor_incarnation_id));
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("MultiDeviceIteratorInit").Device(DEVICE_CPU),
+                        MultiDeviceIteratorInitOp);
+
+// Calls GetNextFromShard(shard) and returns a vector of Tensors as output.
+// TODO(rohanj): Implement using BackgroundWorker that Derek built?
+class MultiDeviceIteratorGetNextFromShardOp : public AsyncOpKernel {
+ public:
+  explicit MultiDeviceIteratorGetNextFromShardOp(OpKernelConstruction* ctx)
+      : AsyncOpKernel(ctx),
+        thread_pool_(new thread::ThreadPool(
+            ctx->env(), ThreadOptions(),
+            strings::StrCat("multi_device_iterator_get_next_thread_",
+                            SanitizeThreadSuffix(name())),
+            1 /* num_threads */, false /* low_latency_hint */)) {}
+
+  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
+    const Tensor* tensor_shard_num;
+    OP_REQUIRES_OK_ASYNC(ctx, ctx->input("shard_num", &tensor_shard_num), done);
+    int32 shard_num = tensor_shard_num->scalar<int32>()();
+
+    const Tensor* tensor_incarnation_id;
+    OP_REQUIRES_OK_ASYNC(
+        ctx, ctx->input("incarnation_id", &tensor_incarnation_id), done);
+    int64 incarnation_id = tensor_incarnation_id->scalar<int64>()();
+
+    MultiDeviceIterator* iterator;
+    OP_REQUIRES_OK_ASYNC(
+        ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done);
+    thread_pool_->Schedule(std::bind(
+        [ctx, iterator, shard_num, incarnation_id](DoneCallback done) {
+          std::vector<Tensor> components;
+          bool end_of_sequence = false;
+
+          IteratorContext::Params params;
+          params.env = ctx->env();
+          params.runner = *(ctx->runner());
+          params.function_library = iterator->function_library();
+          DeviceBase* device = ctx->function_library()->device();
+          params.allocator_getter = [device](AllocatorAttributes attrs) {
+            return device->GetAllocator(attrs);
+          };
+          IteratorContext iter_ctx(std::move(params));
+
+          Status s =
+              iterator->GetNextFromShard(&iter_ctx, shard_num, incarnation_id,
+                                         &components, &end_of_sequence);
+          iterator->Unref();
+
+          if (!s.ok()) {
+            ctx->SetStatus(s);
+          } else if (end_of_sequence) {
+            ctx->SetStatus(errors::OutOfRange("End of sequence"));
+          } else {
+            for (int i = 0; i < components.size(); ++i) {
+              // TODO(mrry): Check that the shapes match the shape attrs.
+              ctx->set_output(i, components[i]);
+            }
+          }
+          done();
+        },
+        std::move(done)));
+  }
+
+ private:
+  std::unique_ptr<thread::ThreadPool> thread_pool_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("MultiDeviceIteratorGetNextFromShard").Device(DEVICE_CPU),
+    MultiDeviceIteratorGetNextFromShardOp);
+
+class MultiDeviceIteratorToStringHandleOp : public OpKernel {
+ public:
+  explicit MultiDeviceIteratorToStringHandleOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& resource_handle_t = ctx->input(0);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(resource_handle_t.shape()),
+                errors::InvalidArgument("resource_handle must be a scalar"));
+
+    // Validate that the handle corresponds to a real resource, and
+    // that it is an MultiDeviceIterator.
+    MultiDeviceIterator* resource;
+    OP_REQUIRES_OK(ctx,
+                   LookupResource(ctx, HandleFromInput(ctx, 0), &resource));
+    resource->Unref();
+
+    Tensor* string_handle_t;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_output(0, TensorShape({}), &string_handle_t));
+    string_handle_t->scalar<string>()() =
+        resource_handle_t.scalar<ResourceHandle>()().SerializeAsString();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("MultiDeviceIteratorToStringHandle").Device(DEVICE_CPU),
+    MultiDeviceIteratorToStringHandleOp);
+
+class MultiDeviceIteratorFromStringHandleOp : public OpKernel {
+ public:
+  explicit MultiDeviceIteratorFromStringHandleOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES(
+        ctx,
+        output_types_.empty() || output_shapes_.empty() ||
+            output_types_.size() == output_shapes_.size(),
+        errors::InvalidArgument("If both 'output_types' and 'output_shapes' "
+                                "are set, they must have the same length."));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& string_handle_t = ctx->input(0);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(string_handle_t.shape()),
+                errors::InvalidArgument("string_handle must be a scalar"));
+
+    ResourceHandle resource_handle;
+    OP_REQUIRES(
+        ctx,
+        resource_handle.ParseFromString(string_handle_t.scalar<string>()()),
+        errors::InvalidArgument(
+            "Could not parse string_handle as a valid ResourceHandle"));
+
+    OP_REQUIRES(
+        ctx, resource_handle.device() == ctx->device()->attributes().name(),
+        errors::InvalidArgument("Attempted create an iterator on device \"",
+                                ctx->device()->attributes().name(),
+                                "\" from handle defined on device \"",
+                                resource_handle.device(), "\""));
+
+    // Validate that the handle corresponds to a real resource, and
+    // that it is an MultiDeviceIterator.
+    MultiDeviceIterator* resource;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, resource_handle, &resource));
+    core::ScopedUnref unref_iterator(resource);
+    if (!output_types_.empty()) {
+      OP_REQUIRES_OK(ctx,
+                     VerifyTypesMatch(output_types_, resource->output_types()));
+    }
+    if (!output_shapes_.empty()) {
+      OP_REQUIRES_OK(ctx, VerifyShapesCompatible(output_shapes_,
+                                                 resource->output_shapes()));
+    }
+
+    Tensor* resource_handle_t;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output(0, TensorShape({}), &resource_handle_t));
+    resource_handle_t->scalar<ResourceHandle>()() = resource_handle;
+  }
+
+ private:
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("MultiDeviceIteratorFromStringHandle").Device(DEVICE_CPU),
+    MultiDeviceIteratorFromStringHandleOp);
+
+}  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/data/ops/dataset_ops.cc b/tensorflow/contrib/data/ops/dataset_ops.cc
index b5c6f2e241f948ba300a0ab3418cb68f6236e70e..66a7c7fdcd5e0ab77596177c209470e17f63bc10 100644
--- a/tensorflow/contrib/data/ops/dataset_ops.cc
+++ b/tensorflow/contrib/data/ops/dataset_ops.cc
@@ -145,6 +145,80 @@ Resets the FunctionBufferingResource.
 function_buffer_resource: The FunctionBufferingResource handle.
 )doc");
 
+REGISTER_OP("MultiDeviceIterator")
+    .Output("handle: resource")
+    .Attr("devices: list(string) >= 1")
+    .Attr("shared_name: string")
+    .Attr("container: string")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Doc(R"doc(
+Creates a MultiDeviceIterator resource.
+
+handle: Handle to the resource created.
+devices: A list of devices the iterator works across.
+shared_name: If non-empty, this resource will be shared under the given name
+  across multiple sessions.
+container: If non-empty, this resource is placed in the given container.
+  Otherwise, a default container is used.
+output_types: The type list for the return values.
+output_shapes: The list of shapes being produced.
+)doc");
+
+REGISTER_OP("MultiDeviceIteratorInit")
+    .Input("dataset: variant")
+    .Input("multi_device_iterator: resource")
+    .Output("incarnation_id: int64")
+    .Doc(R"doc(
+Initializes the multi device iterator with the given dataset.
+incarnation_id: An int64 indicating which incarnation of the MultiDeviceIterator
+  is running.
+dataset: Dataset to be iterated upon.
+multi_device_iterator: A MultiDeviceIteratorResource.
+)doc");
+
+REGISTER_OP("MultiDeviceIteratorGetNextFromShard")
+    .Input("multi_device_iterator: resource")
+    .Input("shard_num: int32")
+    .Input("incarnation_id: int64")
+    .Output("components: output_types")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Doc(R"doc(
+Gets next element for the provided shard number.
+
+multi_device_iterator: A MultiDeviceIterator resource.
+shard_num: Integer representing which shard to fetch data for.
+incarnation_id: Which incarnation of the MultiDeviceIterator is running.
+components: Result of the get_next on the dataset.
+output_types: The type list for the return values.
+output_shapes: The list of shapes being produced.
+)doc");
+
+REGISTER_OP("MultiDeviceIteratorToStringHandle")
+    .Input("multi_device_iterator: resource")
+    .Output("string_handle: string")
+    .Doc(R"doc(
+Produces a string handle for the given MultiDeviceIterator.
+
+multi_device_iterator: A MultiDeviceIterator resource.
+string_handle: A string representing the resource.
+)doc");
+
+REGISTER_OP("MultiDeviceIteratorFromStringHandle")
+    .Input("string_handle: string")
+    .Output("multi_device_iterator: resource")
+    .Attr("output_types: list(type) >= 0 = []")
+    .Attr("output_shapes: list(shape) >= 0 = []")
+    .Doc(R"doc(
+Generates a MultiDeviceIterator resource from its provided string handle.
+
+string_handle: String representing the resource.
+multi_device_iterator: A MultiDeviceIterator resource.
+output_types: The type list for the return values.
+output_shapes: The list of shapes being produced.
+)doc");
+
 REGISTER_OP("ThreadPoolDataset")
     .Input("input_dataset: variant")
     .Input("thread_pool: resource")
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index f805027727c935511819d4580c1aa3686f1c80c6..2de1a79d28c16706e3c237d62935212ce387c776 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -121,6 +121,7 @@ py_test(
     srcs = ["get_single_element_test.py"],
     deps = [
         "//tensorflow/contrib/data/python/ops:get_single_element",
+        "//tensorflow/contrib/data/python/ops:grouping",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -128,6 +129,7 @@ py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -190,6 +192,7 @@ py_test(
     deps = [
         "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/contrib/data/python/ops:error_ops",
+        "//tensorflow/contrib/data/python/ops:optimization",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
@@ -211,6 +214,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -232,7 +236,12 @@ cuda_py_test(
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
     ],
-    tags = ["no_windows_gpu"],
+    tags = [
+        "manual",
+        "no_oss",
+        "no_windows_gpu" +
+        "notap",
+    ],
 )
 
 py_test(
diff --git a/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py b/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
index 87b7c6ddb7afcbaaf8fe97cd8be87e6f5af8cd4d..e6883d53e02c0f96d966a52abfe2f9b4118f2e12 100644
--- a/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/get_single_element_test.py
@@ -17,9 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
+import numpy as np
+
 from tensorflow.contrib.data.python.ops import get_single_element
+from tensorflow.contrib.data.python.ops import grouping
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
@@ -27,40 +30,69 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class GetSingleElementTest(test.TestCase):
+class GetSingleElementTest(test.TestCase, parameterized.TestCase):
 
-  def testGetSingleElement(self):
-    skip_value = array_ops.placeholder(dtypes.int64, shape=[])
-    take_value = array_ops.placeholder_with_default(
-        constant_op.constant(1, dtype=dtypes.int64), shape=[])
+  @parameterized.named_parameters(
+      ("Zero", 0, 1),
+      ("Five", 5, 1),
+      ("Ten", 10, 1),
+      ("Empty", 100, 1, errors.InvalidArgumentError, "Dataset was empty."),
+      ("MoreThanOne", 0, 2, errors.InvalidArgumentError,
+       "Dataset had more than one element."),
+  )
+  def testGetSingleElement(self, skip, take, error=None, error_msg=None):
+    skip_t = array_ops.placeholder(dtypes.int64, shape=[])
+    take_t = array_ops.placeholder(dtypes.int64, shape=[])
 
     def make_sparse(x):
       x_1d = array_ops.reshape(x, [1])
       x_2d = array_ops.reshape(x, [1, 1])
       return sparse_tensor.SparseTensor(x_2d, x_1d, x_1d)
 
-    dataset = (dataset_ops.Dataset.range(100)
-               .skip(skip_value)
-               .map(lambda x: (x * x, make_sparse(x)))
-               .take(take_value))
-
+    dataset = dataset_ops.Dataset.range(100).skip(skip_t).map(
+        lambda x: (x * x, make_sparse(x))).take(take_t)
     element = get_single_element.get_single_element(dataset)
 
     with self.test_session() as sess:
-      for x in [0, 5, 10]:
-        dense_val, sparse_val = sess.run(element, feed_dict={skip_value: x})
-        self.assertEqual(x * x, dense_val)
-        self.assertAllEqual([[x]], sparse_val.indices)
-        self.assertAllEqual([x], sparse_val.values)
-        self.assertAllEqual([x], sparse_val.dense_shape)
-
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "Dataset was empty."):
-        sess.run(element, feed_dict={skip_value: 100})
-
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "Dataset had more than one element."):
-        sess.run(element, feed_dict={skip_value: 0, take_value: 2})
+      if error is None:
+        dense_val, sparse_val = sess.run(
+            element, feed_dict={
+                skip_t: skip,
+                take_t: take
+            })
+        self.assertEqual(skip * skip, dense_val)
+        self.assertAllEqual([[skip]], sparse_val.indices)
+        self.assertAllEqual([skip], sparse_val.values)
+        self.assertAllEqual([skip], sparse_val.dense_shape)
+      else:
+        with self.assertRaisesRegexp(error, error_msg):
+          sess.run(element, feed_dict={skip_t: skip, take_t: take})
+
+  @parameterized.named_parameters(
+      ("SumZero", 0),
+      ("SumOne", 1),
+      ("SumFive", 5),
+      ("SumTen", 10),
+  )
+  def testReduceDataset(self, stop):
+    def init_fn(_):
+      return np.int64(0)
+
+    def reduce_fn(state, value):
+      return state + value
+
+    def finalize_fn(state):
+      return state
+
+    sum_reducer = grouping.Reducer(init_fn, reduce_fn, finalize_fn)
+
+    stop_t = array_ops.placeholder(dtypes.int64, shape=[])
+    dataset = dataset_ops.Dataset.range(stop_t)
+    element = get_single_element.reduce_dataset(dataset, sum_reducer)
+
+    with self.test_session() as sess:
+      value = sess.run(element, feed_dict={stop_t: stop})
+      self.assertEqual(stop * (stop - 1) / 2, value)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
index b7025f3802c0c280981df20c86747e49fdf2274f..48adc98e9a4caee1651d5c7bca9dd813f11dfb01 100644
--- a/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/map_dataset_op_test.py
@@ -26,6 +26,7 @@ import numpy as np
 
 from tensorflow.contrib.data.python.ops import batching
 from tensorflow.contrib.data.python.ops import error_ops
+from tensorflow.contrib.data.python.ops import optimization
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import dataset_ops
@@ -82,15 +83,17 @@ class MapDatasetTest(test.TestCase):
     def write_string_to_file(value, filename):
       with open(filename, "w") as f:
         f.write(value)
-    filenames = [os.path.join(self.get_temp_dir(), "file_%d.txt" % i)
-                 for i in range(5)]
+
+    filenames = [
+        os.path.join(self.get_temp_dir(), "file_%d.txt" % i) for i in range(5)
+    ]
     for filename in filenames:
       write_string_to_file(filename, filename)
 
     dataset = (
         dataset_ops.Dataset.from_tensor_slices(filenames).map(
-            io_ops.read_file, num_parallel_calls=2).prefetch(2).apply(
-                error_ops.ignore_errors()))
+            io_ops.read_file,
+            num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
     iterator = dataset.make_initializable_iterator()
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -264,5 +267,46 @@ class MapDatasetBenchmark(test.Benchmark):
     benchmark("Transformation parallelism evaluation", par_num_calls_series)
     benchmark("Threadpool size evaluation", par_inter_op_series)
 
+  # This benchmark compares the performance of pipeline with multiple chained
+  # maps with and without map fusion.
+  def benchmarkChainOfMaps(self):
+    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
+    for chain_length in chain_lengths:
+      self._benchmarkChainOfMaps(chain_length, False)
+      self._benchmarkChainOfMaps(chain_length, True)
+
+  def _benchmarkChainOfMaps(self, chain_length, optimize_dataset):
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
+      for _ in range(chain_length):
+        dataset = dataset.map(lambda x: x)
+      if optimize_dataset:
+        dataset = dataset.apply(optimization.optimize(["map_fusion"]))
+
+      iterator = dataset.make_one_shot_iterator()
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for _ in range(5):
+          sess.run(next_element.op)
+        deltas = []
+        for _ in range(100):
+          start = time.time()
+          for _ in range(100):
+            sess.run(next_element.op)
+          end = time.time()
+          deltas.append(end - start)
+
+        median_wall_time = np.median(deltas) / 100
+        opt_mark = "opt" if optimize_dataset else "no-opt"
+        print("Map dataset {} chain length: {} Median wall time: {}".format(
+            opt_mark, chain_length, median_wall_time))
+        self.report_benchmark(
+            iters=1000,
+            wall_time=median_wall_time,
+            name="benchmark_map_dataset_chain_latency_{}_{}".format(
+                opt_mark, chain_length))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py
index cfef40e1923607406d6587466dc5f533499220eb..d8156dc9c7bf187d7399aede44c41c8c50670248 100644
--- a/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py
@@ -17,13 +17,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
+
 from tensorflow.contrib.data.python.ops import optimization
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
 
 
-class OptimizeDatasetTest(test.TestCase):
+class OptimizeDatasetTest(test.TestCase, parameterized.TestCase):
 
   def testAssertSuffix(self):
     dataset = dataset_ops.Dataset.from_tensors(0).apply(
@@ -110,6 +112,53 @@ class OptimizeDatasetTest(test.TestCase):
                                    "Function .* is not defined."):
         sess.run(get_next)
 
+  @staticmethod
+  def map_functions():
+    identity = lambda x: x
+    increment = lambda x: x + 1
+
+    def increment_and_square(x):
+      y = x + 1
+      return y * y
+
+    functions = [identity, increment, increment_and_square]
+    tests = []
+
+    for fun1 in functions:
+      for fun2 in functions:
+        tests.append(([fun1, fun2],))
+        for fun3 in functions:
+          tests.append(([fun1, fun2, fun3],))
+
+    swap = lambda x, n: (n, x)
+    tests.append(([lambda x: (x, 42), swap],))
+    tests.append(([lambda x: (x, 42), swap, swap],))
+    return tuple(tests)
+
+  @parameterized.parameters(*map_functions.__func__())
+  def testMapFusion(self, functions):
+    dataset = dataset_ops.Dataset.range(5).apply(
+        optimization.assert_next(["Map", "Prefetch"]))
+    for function in functions:
+      dataset = dataset.map(function)
+
+    dataset = dataset.prefetch(0).apply(optimization.optimize(["map_fusion"]))
+    iterator = dataset.make_one_shot_iterator()
+    get_next = iterator.get_next()
+    with self.test_session() as sess:
+      for x in range(5):
+        result = sess.run(get_next)
+        r = x
+        for function in functions:
+          if isinstance(r, tuple):
+            r = function(*r)  # Pass tuple as multiple arguments.
+          else:
+            r = function(r)
+        self.assertAllEqual(r, result)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
index 82543b10395116a273954bd71bd5e5fde6679585..2da6131e8e60ca53723da7f66a7ee52151640129 100644
--- a/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/prefetching_ops_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 
@@ -907,5 +908,155 @@ class CopyToDeviceTest(test.TestCase):
         sess.run(next_element)
 
 
+class MultiDeviceIteratorTest(test.TestCase):
+
+  def testBasic(self):
+    dataset = dataset_ops.Dataset.range(10)
+    multi_device_iterator = prefetching_ops.MultiDeviceIterator(
+        dataset, ["/cpu:1", "/cpu:2"])
+    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+    with self.test_session(config=config) as sess:
+      sess.run(multi_device_iterator.initializer)
+      for i in range(0, 10, 2):
+        self.assertEqual(i, sess.run(elem_on_1))
+        self.assertEqual(i + 1, sess.run(elem_on_2))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(elem_on_1)
+        sess.run(elem_on_2)
+
+  def testOneOnSameDevice(self):
+    with ops.device("/cpu:0"):
+      dataset = dataset_ops.Dataset.range(10)
+    multi_device_iterator = prefetching_ops.MultiDeviceIterator(
+        dataset, ["/cpu:0", "/cpu:1"])
+    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+
+    config = config_pb2.ConfigProto(device_count={"CPU": 2})
+    with self.test_session(config=config) as sess:
+      sess.run(multi_device_iterator.initializer)
+      for i in range(0, 10, 2):
+        self.assertEqual(i, sess.run(elem_on_1))
+        self.assertEqual(i + 1, sess.run(elem_on_2))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(elem_on_1)
+        sess.run(elem_on_2)
+
+  def testRepeatDevices(self):
+    with ops.device("/cpu:0"):
+      dataset = dataset_ops.Dataset.range(20)
+    multi_device_iterator = prefetching_ops.MultiDeviceIterator(
+        dataset, ["/cpu:1", "/cpu:2", "/cpu:1", "/cpu:2"])
+    elements = multi_device_iterator.get_next()
+    elem_on_1, elem_on_2, elem_on_3, elem_on_4 = elements
+
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+    with self.test_session(config=config) as sess:
+      sess.run(multi_device_iterator.initializer)
+      for i in range(0, 20, 4):
+        self.assertEqual(i, sess.run(elem_on_1))
+        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i + 2, sess.run(elem_on_3))
+        self.assertEqual(i + 3, sess.run(elem_on_4))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(elem_on_1)
+        sess.run(elem_on_2)
+        sess.run(elem_on_3)
+        sess.run(elem_on_4)
+
+  def testNotFullyDivisible(self):
+    dataset = dataset_ops.Dataset.range(9)
+    multi_device_iterator = prefetching_ops.MultiDeviceIterator(
+        dataset, ["/cpu:1", "/cpu:2"])
+    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+    with self.test_session(config=config) as sess:
+      sess.run(multi_device_iterator.initializer)
+      for i in range(0, 8, 2):
+        self.assertEqual(i, sess.run(elem_on_1))
+        self.assertEqual(i + 1, sess.run(elem_on_2))
+      self.assertEqual(8, sess.run(elem_on_1))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(elem_on_1)
+        sess.run(elem_on_2)
+
+  def testUneven(self):
+    dataset = dataset_ops.Dataset.range(10)
+    multi_device_iterator = prefetching_ops.MultiDeviceIterator(
+        dataset, ["/cpu:1", "/cpu:2"])
+    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+    with self.test_session(config=config) as sess:
+      sess.run(multi_device_iterator.initializer)
+      for i in range(0, 10, 2):
+        self.assertEqual(i, sess.run(elem_on_1))
+      for i in range(0, 10, 2):
+        self.assertEqual(i + 1, sess.run(elem_on_2))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(elem_on_1)
+        sess.run(elem_on_2)
+
+  def testMultipleInitializations(self):
+    with ops.device("/cpu:0"):
+      epoch = array_ops.placeholder(dtypes.int64, shape=[])
+      dataset1 = dataset_ops.Dataset.from_tensors(epoch).repeat(1000)
+      dataset2 = dataset_ops.Dataset.range(1000)
+      dataset = dataset_ops.Dataset.zip((dataset1, dataset2))
+    multi_device_iterator = prefetching_ops.MultiDeviceIterator(
+        dataset, ["/cpu:1", "/cpu:2"], prefetch_buffer_size=4)
+    elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+    init_op = multi_device_iterator.initializer
+
+    config = config_pb2.ConfigProto(device_count={"CPU": 3})
+    with self.test_session(config=config) as sess:
+      for i in range(1000):
+        sess.run(init_op, feed_dict={epoch: i})
+        self.assertEqual([(i, 0), (i, 1)], sess.run([elem_on_1, elem_on_2]))
+
+  def testBasicGpu(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    with compat.forward_compatibility_horizon(2018, 8, 4):
+      dataset = dataset_ops.Dataset.range(10)
+      multi_device_iterator = prefetching_ops.MultiDeviceIterator(
+          dataset, ["/cpu:1", "/gpu:0"])
+      elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+
+      config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
+      with self.test_session(config=config) as sess:
+        sess.run(multi_device_iterator.initializer)
+        for i in range(0, 10, 2):
+          self.assertEqual(i, sess.run(elem_on_1))
+          self.assertEqual(i + 1, sess.run(elem_on_2))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(elem_on_1)
+          sess.run(elem_on_2)
+
+  def testUnevenGpu(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    with compat.forward_compatibility_horizon(2018, 8, 4):
+      dataset = dataset_ops.Dataset.range(10)
+      multi_device_iterator = prefetching_ops.MultiDeviceIterator(
+          dataset, ["/cpu:1", "/gpu:0"])
+      elem_on_1, elem_on_2 = multi_device_iterator.get_next()
+
+      config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
+      with self.test_session(config=config) as sess:
+        sess.run(multi_device_iterator.initializer)
+        for i in range(0, 10, 2):
+          self.assertEqual(i, sess.run(elem_on_1))
+        for i in range(0, 10, 2):
+          self.assertEqual(i + 1, sess.run(elem_on_2))
+        with self.assertRaises(errors.OutOfRangeError):
+          sess.run(elem_on_1)
+          sess.run(elem_on_2)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 160d7fe22a9f127f7ee23d7a988c22cc4430ce11..1ad021ea037add48afee5bdfda9eea18485eca5d 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -28,10 +28,12 @@ py_library(
     srcs = ["get_single_element.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":grouping",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -129,6 +131,7 @@ py_library(
         "//tensorflow/python/data/util:convert",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
+        "//third_party/py/numpy",
     ],
 )
 
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index a4914f4cde71925af477636c91d98b54ce0cce0e..42fc20ec015a078ef8cd42065196f45438f19785 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -515,10 +515,7 @@ def batch_and_drop_remainder(batch_size):
 
   def _apply_fn(dataset):
     """Function from `Dataset` to `Dataset` that applies the transformation."""
-    # TODO(jsimsa): Switch to using `batch(..., drop_remainder=True)` any time
-    # after 6/30/2018.
-    batched = dataset.batch(batch_size)
-    return _filter_irregular_batches(batch_size)(batched)
+    return dataset.batch(batch_size, drop_remainder=True)
 
   return _apply_fn
 
@@ -553,11 +550,9 @@ def padded_batch_and_drop_remainder(batch_size,
 
   def _apply_fn(dataset):
     """Function from `Dataset` to `Dataset` that applies the transformation."""
-    # TODO(jsimsa): Switch to using `padded_batch(..., drop_remainder=True)`
-    # any time after 6/30/2018.
-    batched = dataset.padded_batch(
-        batch_size, padded_shapes=padded_shapes, padding_values=padding_values)
-    return _filter_irregular_batches(batch_size)(batched)
+    return dataset.padded_batch(
+        batch_size, padded_shapes=padded_shapes, padding_values=padding_values,
+        drop_remainder=True)
 
   return _apply_fn
 
diff --git a/tensorflow/contrib/data/python/ops/get_single_element.py b/tensorflow/contrib/data/python/ops/get_single_element.py
index 0f4cd8e20c5727a5bcfa1dce4dadbfa8f90bd551..ef9284456eb35099db804e0680abfacd6384d503 100644
--- a/tensorflow/contrib/data/python/ops/get_single_element.py
+++ b/tensorflow/contrib/data/python/ops/get_single_element.py
@@ -17,6 +17,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
+from tensorflow.contrib.data.python.ops import grouping
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
@@ -68,3 +71,30 @@ def get_single_element(dataset):
   return sparse.deserialize_sparse_tensors(
       nested_ret, dataset.output_types, dataset.output_shapes,
       dataset.output_classes)
+
+
+def reduce_dataset(dataset, reducer):
+  """Returns the result of reducing the `dataset` using `reducer`.
+
+  Args:
+    dataset: A @{tf.data.Dataset} object.
+    reducer: A @{tf.contrib.data.Reducer} object representing the reduce logic.
+
+  Returns:
+    A nested structure of @{tf.Tensor} objects, corresponding to the result
+    of reducing `dataset` using `reducer`.
+
+  Raises:
+    TypeError: if `dataset` is not a `tf.data.Dataset` object.
+  """
+  if not isinstance(dataset, dataset_ops.Dataset):
+    raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
+
+  # The sentinel dataset is used in case the reduced dataset is empty.
+  sentinel_dataset = dataset_ops.Dataset.from_tensors(
+      reducer.finalize_func(reducer.init_func(np.int64(0))))
+  reduced_dataset = dataset.apply(
+      grouping.group_by_reducer(lambda x: np.int64(0), reducer))
+
+  return get_single_element(
+      reduced_dataset.concatenate(sentinel_dataset).take(1))
diff --git a/tensorflow/contrib/data/python/ops/prefetching_ops.py b/tensorflow/contrib/data/python/ops/prefetching_ops.py
index 50212d3b523dda2d74523b83e910f1e8f2991cdd..0edd7c9fe974784f199c272a649b302e72d8c218 100644
--- a/tensorflow/contrib/data/python/ops/prefetching_ops.py
+++ b/tensorflow/contrib/data/python/ops/prefetching_ops.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_dataset_ops as core_gen_dataset_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -480,6 +481,11 @@ class _CopyToDeviceDataset(dataset_ops.Dataset):
 
     self._finalize_func = _remote_finalize_func
     self._finalize_captured_args = _remote_finalize_func.captured_inputs
+
+    g = ops.get_default_graph()
+    _remote_init_func.add_to_graph(g)
+    _remote_next_func.add_to_graph(g)
+    _remote_finalize_func.add_to_graph(g)
     # pylint: enable=protected-scope
 
   # The one_shot_iterator implementation needs a 0 arg _make_dataset function
@@ -518,3 +524,174 @@ class _CopyToDeviceDataset(dataset_ops.Dataset):
   @property
   def output_classes(self):
     return self._input_dataset.output_classes
+
+
+class _PerDeviceGenerator(dataset_ops.Dataset):
+  """A `dummy` generator dataset."""
+
+  def __init__(self, shard_num, multi_device_iterator_resource, incarnation_id,
+               source_device, target_device, output_shapes, output_types,
+               output_classes):
+    self._target_device = target_device
+    self._output_types = output_types
+    self._output_shapes = output_shapes
+    self._output_classes = output_classes
+    self._flat_output_shapes = nest.flatten(
+        sparse.as_dense_shapes(self._output_shapes, self._output_classes))
+    self._flat_output_types = nest.flatten(
+        sparse.as_dense_types(self._output_types, self._output_classes))
+
+    multi_device_iterator_string_handle = (
+        gen_dataset_ops.multi_device_iterator_to_string_handle(
+            multi_device_iterator_resource))
+
+    @function.Defun()
+    def _init_func():
+      return multi_device_iterator_string_handle
+
+    @function.Defun()
+    def _remote_init_func():
+      return functional_ops.remote_call(
+          target=source_device,
+          args=_init_func.captured_inputs,
+          Tout=[dtypes.string],
+          f=_init_func)
+
+    self._init_func = _remote_init_func
+    self._init_captured_args = _remote_init_func.captured_inputs
+
+    @function.Defun(dtypes.string)
+    def _next_func(string_handle):
+      multi_device_iterator = (
+          gen_dataset_ops.multi_device_iterator_from_string_handle(
+              string_handle=string_handle,
+              output_types=self._flat_output_types,
+              output_shapes=self._flat_output_shapes))
+      return gen_dataset_ops.multi_device_iterator_get_next_from_shard(
+          multi_device_iterator=multi_device_iterator,
+          shard_num=shard_num,
+          incarnation_id=incarnation_id,
+          output_types=self._flat_output_types,
+          output_shapes=self._flat_output_shapes)
+
+    @function.Defun(dtypes.string)
+    def _remote_next_func(string_handle):
+      return functional_ops.remote_call(
+          target=source_device,
+          args=[string_handle] + _next_func.captured_inputs,
+          Tout=self._flat_output_types,
+          f=_next_func)
+
+    self._next_func = _remote_next_func
+    self._next_captured_args = _remote_next_func.captured_inputs
+
+    @function.Defun(dtypes.string)
+    def _finalize_func(unused_string_handle):
+      return array_ops.constant(0, dtypes.int64)
+
+    @function.Defun(dtypes.string)
+    def _remote_finalize_func(string_handle):
+      return functional_ops.remote_call(
+          target=source_device,
+          args=[string_handle] + _finalize_func.captured_inputs,
+          Tout=[dtypes.int64],
+          f=_finalize_func)
+
+    self._finalize_func = _remote_finalize_func
+    self._finalize_captured_args = _remote_finalize_func.captured_inputs
+
+  def _as_variant_tensor(self):
+    with ops.device(self._target_device):
+      return core_gen_dataset_ops.generator_dataset(
+          self._init_captured_args,
+          self._next_captured_args,
+          self._finalize_captured_args,
+          init_func=self._init_func,
+          next_func=self._next_func,
+          finalize_func=self._finalize_func,
+          output_types=self._flat_output_types,
+          output_shapes=self._flat_output_shapes)
+
+  @property
+  def output_types(self):
+    return self._output_types
+
+  @property
+  def output_shapes(self):
+    return self._output_shapes
+
+  @property
+  def output_classes(self):
+    return self._output_classes
+
+
+class MultiDeviceIterator(object):
+  """An iterator over multiple devices."""
+
+  def __init__(self,
+               dataset,
+               devices,
+               prefetch_buffer_size=1,
+               source_device="/cpu:0"):
+    self._dataset = dataset
+    self._devices = devices
+    self._source_device = source_device
+    self._source_device_tensor = ops.convert_to_tensor(source_device)
+
+    self._flat_output_shapes = nest.flatten(
+        sparse.as_dense_shapes(self._dataset.output_shapes,
+                               self._dataset.output_classes))
+    self._flat_output_types = nest.flatten(
+        sparse.as_dense_types(self._dataset.output_types,
+                              self._dataset.output_classes))
+
+    # Create the MultiDeviceIterator.
+    with ops.device(self._source_device):
+      self._multi_device_iterator_resource = (
+          gen_dataset_ops.multi_device_iterator(
+              devices=self._devices,
+              shared_name="",
+              container="",
+              output_types=self._flat_output_types,
+              output_shapes=self._flat_output_shapes))
+
+      # The incarnation ID is used to ensure consistency between the per-device
+      # iterators and the multi-device iterator.
+      self._incarnation_id = gen_dataset_ops.multi_device_iterator_init(
+          self._dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          self._multi_device_iterator_resource)
+
+    # TODO(rohanj): Explore the possibility of the MultiDeviceIterator to
+    # initialize the device side of the pipeline. This would allow the
+    # MultiDeviceIterator to choose, for example, to move some transformations
+    # into the device side from its input. It might be useful in rewriting.
+    # Create the per device iterators.
+    self._device_iterators = []
+    i = 0
+    for device in self._devices:
+      ds = _PerDeviceGenerator(
+          i, self._multi_device_iterator_resource, self._incarnation_id,
+          self._source_device_tensor, device, self._dataset.output_shapes,
+          self._dataset.output_types, self._dataset.output_classes)
+      ds = ds.prefetch(prefetch_buffer_size)
+      with ops.device(device):
+        self._device_iterators.append(ds.make_initializable_iterator())
+      i += 1
+
+    device_iterator_initializers = [
+        iterator.initializer for iterator in self._device_iterators
+    ]
+    self._initializer = control_flow_ops.group(*device_iterator_initializers)
+
+  def get_next(self):
+    result = []
+    i = 0
+    for device in self._devices:
+      with ops.device(device):
+        result.append(self._device_iterators[i].get_next())
+      i += 1
+    return result
+
+  @property
+  def initializer(self):
+    return self._initializer
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index f5d7e24ae2e3aa76efc50f4da93411f66edea651..cbe741de5a67c231c0982d6d389b3591cff001ec 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -100,6 +100,23 @@ py_library(
     ],
 )
 
+py_library(
+    name = "parameter_server_strategy",
+    srcs = ["parameter_server_strategy.py"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        ":cross_tower_ops",
+        ":mirrored_strategy",
+        ":values",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+    ],
+)
+
 py_library(
     name = "one_device_strategy",
     srcs = ["one_device_strategy.py"],
@@ -207,6 +224,35 @@ py_test(
     ],
 )
 
+py_test(
+    name = "parameter_server_strategy_test",
+    srcs = ["parameter_server_strategy_test.py"],
+    srcs_version = "PY2AND3",
+    tags = [
+        "no_pip",
+    ],
+    deps = [
+        ":combinations",
+        ":multi_worker_test_base",
+        ":parameter_server_strategy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:layers",
+        "//tensorflow/python:session",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/estimator:run_config",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 cuda_py_test(
     name = "mirrored_strategy_multigpu_test",
     srcs = ["mirrored_strategy_multigpu_test.py"],
diff --git a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
index fe3df9cbb95308251581005fdb858cccd5d19a1d..bcb977f64073b1d15ef5c872eb0d6b09d5307b54 100644
--- a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
+++ b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
@@ -49,17 +49,23 @@ class CheckpointUtilsWithDistributionStrategyTest(
   def testInitFromCheckpoint(self, distribution, in_tower_mode):
     checkpoint_dir = self.get_temp_dir()
     with self.test_session() as session:
-      v1_value, _, _, _ = checkpoint_utils_test._create_checkpoints(
+      v1_value, v2_value, _, _ = checkpoint_utils_test._create_checkpoints(
           session, checkpoint_dir)
 
     def init_and_verify(g):
       v1 = variable_scope.get_variable("new_var1", [1, 10])
+      v2 = variable_scope.get_variable(
+          "new_var2", [10, 10],
+          synchronization=variable_scope.VariableSynchronization.ON_READ,
+          aggregation=variable_scope.VariableAggregation.MEAN)
       checkpoint_utils.init_from_checkpoint(checkpoint_dir, {
           "var1": "new_var1",
+          "var2": "new_var2"
       })
       with self.test_session(graph=g) as session:
         session.run(variables.global_variables_initializer())
         self.assertAllEqual(v1_value, self.evaluate(v1))
+        self.assertAllEqual(v2_value, self.evaluate(v2))
 
     with ops.Graph().as_default() as g, distribution.scope():
       if in_tower_mode:
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/contrib/distribute/python/cross_tower_ops.py
index b0baf0dad1d55eafac5338d1eb43465927e428a1..b6037d2133e23841a7804ed84bca302faa9574e3 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_ops.py
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops.py
@@ -28,18 +28,37 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import device_util
 
 
+def check_destinations(destinations):
+  """Checks whether `destinations` is not None and not empty.
+
+  Args:
+    destinations: a DistributedValues, Variable, string or a list of strings.
+
+  Returns:
+    Boolean indicating whether `destinations` is not None and not empty.
+  """
+  # Calling bool() on a ResourceVariable is not allowed.
+  if isinstance(destinations, resource_variable_ops.ResourceVariable):
+    return bool(destinations.device)
+  return bool(destinations)
+
+
 def validate_destinations(destinations):
-  if not isinstance(destinations,
-                    (value_lib.DistributedValues, six.string_types, list)):
+  if not isinstance(
+      destinations,
+      (value_lib.DistributedValues, resource_variable_ops.ResourceVariable,
+       six.string_types, list)):
     raise ValueError("destinations must be one of a `DistributedValues` object,"
-                     " a device string, a list of device strings or None")
+                     " a tf.Variable object, a device string, a list of device "
+                     "strings or None")
 
-  if not destinations:
+  if not check_destinations(destinations):
     raise ValueError("destinations can not be empty")
 
 
@@ -59,6 +78,8 @@ def _validate_value_destination_pairs(value_destination_pairs):
 def get_devices_from(destinations):
   if isinstance(destinations, value_lib.DistributedValues):
     return list(destinations.devices)
+  elif isinstance(destinations, resource_variable_ops.ResourceVariable):
+    return [destinations.device]
   elif isinstance(destinations, six.string_types):
     return [device_util.resolve(destinations)]
   else:
@@ -225,7 +246,10 @@ class ReductionToOneDeviceCrossTowerOps(CrossTowerOps):
     super(ReductionToOneDeviceCrossTowerOps, self).__init__()
 
   def _reduce(self, aggregation, per_device_value, destinations):
-    devices = get_devices_from(destinations or per_device_value)
+    if check_destinations(destinations):
+      devices = get_devices_from(destinations)
+    else:
+      devices = get_devices_from(per_device_value)
     reduce_to_device = self.reduce_to_device or devices[0]
     reduced = _simple_reduce(per_device_value, reduce_to_device,
                              self.accumulation_fn, aggregation)
@@ -508,7 +532,10 @@ class AllReduceCrossTowerOps(CrossTowerOps):
             logging.WARN,
             "Efficient allreduce is not supported for IndexedSlices.", 10)
 
-      devices = get_devices_from(destinations or per_device_value)
+      if check_destinations(destinations):
+        devices = get_devices_from(destinations)
+      else:
+        devices = get_devices_from(per_device_value)
       reduce_to_device = devices[0]
       reduced = _simple_reduce(per_device_value, reduce_to_device,
                                math_ops.add_n, aggregation)
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index dcbc6b0878b89cbb5b9779de315429e6f9478d15..eb2d102012217026f6edb2256ae05b5ce4e4301e 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 import contextlib
 import threading
-import six
 
 from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
 from tensorflow.contrib.distribute.python import shared_variable_creator
@@ -60,6 +59,156 @@ class _RequestedStop(Exception):
   pass
 
 
+# Make _call_for_each_tower and _reduce_non_distributed_value not members of
+# MirroredStrategy so that they are generally not allowed to use anything
+# specific to MirroredStrategy and thus can be shared with other distribution
+# strategies.
+
+
+# TODO(yuefengz): maybe create a common class for those who need to call this
+# _call_for_each_tower.
+def _call_for_each_tower(distribution, fn, *args, **kwargs):
+  """Run `fn` in separate threads, once per tower/worker device.
+
+  Args:
+    distribution: the DistributionStrategy object.
+    fn: function to run (will be run once per device, each in its own thread).
+    *args: positional arguments for `fn`
+    **kwargs: keyword arguments for `fn`.
+        `"run_concurrently"`: Boolean indicating whether executions of `fn`
+           can be run concurrently (under eager execution only), defaults to
+           `True`.
+
+  Returns:
+    Merged return value of `fn` across all towers.
+
+  Raises:
+    RuntimeError: If fn() calls get_tower_context().merge_call() a different
+        number of times from the available devices.
+  """
+  run_concurrently = kwargs.pop("run_concurrently", True)
+  if not context.executing_eagerly():
+    # Lots of TF library code isn't thread-safe in graph mode, and
+    # there is little to be gained by turning on multithreading when
+    # constructing a graph.
+    run_concurrently = False
+    # Needed for per-thread device, etc. contexts in graph mode.
+    ops.get_default_graph().switch_to_thread_local()
+  elif run_concurrently is None:
+    run_concurrently = True
+
+  coord = coordinator.Coordinator(clean_stop_exception_types=(_RequestedStop,))
+
+  shared_variable_store = {}
+
+  # TODO(isaprykin): Create these threads once instead of during every run()
+  # call.
+  threads = []
+  for index, d in enumerate(distribution.worker_devices):
+    variable_creator_fn = shared_variable_creator.make_fn(
+        shared_variable_store, index)
+    t = MirroredStrategy._MirroredTowerThread(  # pylint: disable=protected-access
+        distribution, coord, d, variable_creator_fn, fn,
+        *values.select_device(d, args), **values.select_device(d, kwargs))
+    threads.append(t)
+
+  for t in threads:
+    t.start()
+
+  # When `fn` starts `should_run` event is set on _MirroredTowerThread
+  # (`MTT`) threads. The execution waits until
+  # `MTT.has_paused` is set, which indicates that either `fn` is
+  # complete or a `get_tower_context().merge_call()` is called.  If `fn` is
+  # complete, then `MTT.done` is set to True.  Otherwise, arguments
+  # of `get_tower_context().merge_call` from all paused threads are grouped
+  # and the `merge_fn` is performed.  Results of the
+  # `get_tower_context().merge_call` are then set to `MTT.merge_result`.
+  # Each such `get_tower_context().merge_call` call returns the
+  # `MTT.merge_result` for that thread when `MTT.should_run` event
+  # is reset again. Execution of `fn` resumes.
+
+  try:
+    with coord.stop_on_exception():
+      all_done = False
+      while not all_done and not coord.should_stop():
+        done = []
+        if run_concurrently:
+          for t in threads:
+            t.should_run.set()
+          for t in threads:
+            t.has_paused.wait()
+            t.has_paused.clear()
+            if coord.should_stop():
+              return None
+            done.append(t.done)
+        else:
+          for t in threads:
+            t.should_run.set()
+            t.has_paused.wait()
+            t.has_paused.clear()
+            if coord.should_stop():
+              return None
+            done.append(t.done)
+        if coord.should_stop():
+          return None
+        all_done = all(done)
+        if not all_done:
+          if any(done):
+            raise RuntimeError("Some towers made a different number of "
+                               "tower_context().merge_call() calls.")
+          # get_tower_context().merge_call() case
+          merge_args = values.regroup({t.device: t.merge_args for t in threads})
+          merge_kwargs = values.regroup(
+              {t.device: t.merge_kwargs for t in threads})
+          # We capture the name_scope of the MTT when we call merge_fn
+          # to ensure that if we have opened a name scope in the MTT,
+          # it will be respected when executing the merge function. We only
+          # capture the name_scope from the first MTT and assume it is
+          # the same for all other MTTs.
+          mtt_captured_name_scope = threads[0].captured_name_scope
+          with ops.name_scope(mtt_captured_name_scope):
+            merge_result = threads[0].merge_fn(distribution, *merge_args,
+                                               **merge_kwargs)
+          for t in threads:
+            t.merge_result = values.select_device(t.device, merge_result)
+  finally:
+    for t in threads:
+      t.should_run.set()
+    coord.join(threads)
+
+  return values.regroup({t.device: t.main_result for t in threads})
+
+
+def _reduce_non_distributed_value(distribution, aggregation, value,
+                                  destinations):
+  """Reduce a non-DistributedValue `value` to `destinations`."""
+  if isinstance(value, values.DistributedValues):
+    raise ValueError("You are passing a `DistributedValue` to "
+                     "`_reduce_non_distributed_value`, which is not allowed.")
+
+  if value == 0:
+    return 0
+  if aggregation == variable_scope.VariableAggregation.MEAN:
+    return distribution.broadcast(value, destinations)
+
+  cross_tower_ops_lib.validate_destinations(destinations)
+  if (len(distribution.worker_devices) != 1 or
+      not cross_tower_ops_lib.check_destinations(destinations)):
+    raise ValueError("A non-DistributedValues value cannot be reduced with the "
+                     "given aggregation.")
+  # TODO(anjalisridhar): Moves these methods to a device utility file?
+  devices = cross_tower_ops_lib.get_devices_from(destinations)
+  if len(devices) == 1:
+    with ops.device(devices[0]):
+      return array_ops.identity(value)
+  else:
+    value_updates = {}
+    for d in devices:
+      with ops.device(d):
+        value_updates[d] = array_ops.identity(value)
+    return values.Mirrored(value_updates)
+
+
 class MirroredStrategy(distribute_lib.DistributionStrategy):
   """Mirrors vars to distribute across multiple devices on a single machine.
 
@@ -198,116 +347,7 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
                                                  self._devices)
 
   def _call_for_each_tower(self, fn, *args, **kwargs):
-    """Run `fn` in separate threads, once per tower/worker device.
-
-    Args:
-      fn: function to run (will be run once per device, each in its own thread).
-      *args: positional arguments for `fn`
-      **kwargs: keyword arguments for `fn`.
-          `"run_concurrently"`: Boolean indicating whether executions of `fn`
-             can be run concurrently (under eager execution only), defaults to
-             `True`.
-
-    Returns:
-      Merged return value of `fn` across all towers.
-
-    Raises:
-      RuntimeError: If fn() calls get_tower_context().merge_call() a different
-          number of times for when called for different devices.
-    """
-    run_concurrently = kwargs.pop("run_concurrently", True)
-    if not context.executing_eagerly():
-      # Lots of TF library code isn't thread-safe in graph mode, and
-      # there is little to be gained by turning on multithreading when
-      # constructing a graph.
-      run_concurrently = False
-      # Needed for per-thread device, etc. contexts in graph mode.
-      ops.get_default_graph().switch_to_thread_local()
-    elif run_concurrently is None:
-      run_concurrently = True
-
-    coord = coordinator.Coordinator(
-        clean_stop_exception_types=(_RequestedStop,))
-
-    shared_variable_store = {}
-
-    # TODO(isaprykin): Create these threads once instead of during every run()
-    # call.
-    threads = []
-    for index, d in enumerate(self._devices):
-      variable_creator_fn = shared_variable_creator.make_fn(
-          shared_variable_store, index)
-      t = MirroredStrategy._MirroredTowerThread(
-          self, coord, d, variable_creator_fn, fn,
-          *values.select_device(d, args), **values.select_device(d, kwargs))
-      threads.append(t)
-
-    for t in threads:
-      t.start()
-
-    # When `fn` starts `should_run` event is set on _MirroredTowerThread
-    # (`MTT`) threads. The execution waits until
-    # `MTT.has_paused` is set, which indicates that either `fn` is
-    # complete or a `get_tower_context().merge_call()` is called.  If `fn` is
-    # complete, then `MTT.done` is set to True.  Otherwise, arguments
-    # of `get_tower_context().merge_call` from all paused threads are grouped
-    # and the `merge_fn` is performed.  Results of the
-    # `get_tower_context().merge_call` are then set to `MTT.merge_result`.
-    # Each such `get_tower_context().merge_call` call returns the
-    # `MTT.merge_result` for that thread when `MTT.should_run` event
-    # is reset again. Execution of `fn` resumes.
-
-    try:
-      with coord.stop_on_exception():
-        all_done = False
-        while not all_done and not coord.should_stop():
-          done = []
-          if run_concurrently:
-            for t in threads:
-              t.should_run.set()
-            for t in threads:
-              t.has_paused.wait()
-              t.has_paused.clear()
-              if coord.should_stop():
-                return None
-              done.append(t.done)
-          else:
-            for t in threads:
-              t.should_run.set()
-              t.has_paused.wait()
-              t.has_paused.clear()
-              if coord.should_stop():
-                return None
-              done.append(t.done)
-          if coord.should_stop():
-            return None
-          all_done = all(done)
-          if not all_done:
-            if any(done):
-              raise RuntimeError("Some towers made a different number of "
-                                 "tower_context().merge_call() calls.")
-            # get_tower_context().merge_call() case
-            merge_args = values.regroup(
-                {t.device: t.merge_args for t in threads})
-            merge_kwargs = values.regroup(
-                {t.device: t.merge_kwargs for t in threads})
-            # We capture the name_scope of the MTT when we call merge_fn
-            # to ensure that if we have opened a name scope in the MTT,
-            # it will be respected when executing the merge function. We only
-            # capture the name_scope from the first MTT and assume it is
-            # the same for all other MTTs.
-            mtt_captured_name_scope = threads[0].captured_name_scope
-            with ops.name_scope(mtt_captured_name_scope):
-              merge_result = threads[0].merge_fn(
-                  self, *merge_args, **merge_kwargs)
-            for t in threads:
-              t.merge_result = values.select_device(t.device, merge_result)
-    finally:
-      for t in threads:
-        t.should_run.set()
-      coord.join(threads)
-
-    return values.regroup({t.device: t.main_result for t in threads})
+    return _call_for_each_tower(self, fn, *args, **kwargs)
 
   def map(self, map_over, fn, *args, **kwargs):
     # TODO(josh11b): In eager mode, use one thread per device.
@@ -337,29 +377,9 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
 
   def _reduce(self, aggregation, value, destinations):
     assert not isinstance(value, values.Mirrored)
-    if not isinstance(value, values.PerDevice):
-      if value == 0:
-        return 0
-      if aggregation == variable_scope.VariableAggregation.MEAN:
-        return self._broadcast(value, destinations)
-
-      cross_tower_ops_lib.validate_destinations(destinations)
-      if len(self._devices) == 1:
-        if destinations:
-          # TODO(anjalisridhar): Moves these methods to a device utility file?
-          devices = cross_tower_ops_lib.get_devices_from(destinations)
-          if len(devices) == 1:
-            with ops.device(devices[0]):
-              return array_ops.identity(value)
-          else:
-            value_updates = {}
-            for d in devices:
-              with ops.device(d):
-                value_updates[d] = array_ops.identity(value)
-            return values.Mirrored(value_updates)
-      raise ValueError("A non PerDevice value cannot be reduced with the given "
-                       "aggregation.")
-
+    if not isinstance(value, values.DistributedValues):
+      return _reduce_non_distributed_value(self, aggregation, value,
+                                           destinations)
     return self._get_cross_tower_ops().reduce(
         aggregation, value, destinations=destinations)
 
@@ -433,15 +453,8 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
   def _get_devices_from(self, colocate_with=None):
     if colocate_with is None:
       return self._devices
-    elif isinstance(colocate_with, values.DistributedValues):
-      # pylint: disable=protected-access
-      return list(colocate_with._index.keys())
-    elif isinstance(colocate_with, six.string_types):
-      return [device_util.resolve(colocate_with)]
-    elif isinstance(colocate_with, list):
-      return [device_util.resolve(d) for d in colocate_with]
     else:
-      return colocate_with
+      return cross_tower_ops_lib.get_devices_from(colocate_with)
 
   class _MirroredTowerThread(threading.Thread):
     """A thread that runs() a function on a device."""
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index 6a14b833d2485e1c672f6cb3e066c56cc19e699c..aab7119901023affaad954c4c4ca7678a2ffee06 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -792,8 +792,8 @@ class MirroredVariableUpdateTest(test.TestCase):
         return mirrored_var.assign(5.0)
 
       with self.assertRaisesRegexp(
-          ValueError, "A non PerDevice value cannot be reduced with the given "
-          "aggregation."):
+          ValueError, "A non-DistributedValues value cannot be reduced with "
+          "the given aggregation."):
         self.evaluate(dist.unwrap(dist.call_for_each_tower(model_fn)))
 
   @test_util.run_in_graph_and_eager_modes(config=config)
@@ -967,5 +967,74 @@ class MirroredAndTowerLocalVariableInitializerTest(test.TestCase):
         self.evaluate(tower_local_var.initializer)
         self.assertTrue(self.evaluate(tower_local_var.is_initialized()))
 
+
+class TowerLocalVariableAssignTest(test.TestCase):
+  config = config_pb2.ConfigProto()
+  config.allow_soft_placement = True
+
+  def _skip_eager_if_gpus_less_than(self, num_gpus):
+    if context.num_gpus() < num_gpus and context.executing_eagerly():
+      self.skipTest("Enough GPUs not available for this test in eager mode.")
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testAssignTowerLocalVarSumAggregation(self):
+    self._skip_eager_if_gpus_less_than(1)
+    def model_fn():
+      v_sum = variable_scope.variable(
+          1.0,
+          synchronization=variable_scope.VariableSynchronization.ON_READ,
+          aggregation=variable_scope.VariableAggregation.SUM)
+      return v_sum
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      tower_local_var = dist.call_for_each_tower(model_fn,
+                                                 run_concurrently=False)
+      self.assertTrue(isinstance(tower_local_var, values.TowerLocalVariable))
+      self.evaluate(variables.global_variables_initializer())
+      # Each tower has a value of 1.0 assigned to it in tower context.
+      # When we read the value using `read_var` we should see the SUM of each of
+      # values on each of the towers.
+      self.assertEqual(2.0, self.evaluate(dist.read_var(tower_local_var)))
+      # Assigning 6.0 in cross tower context will assign a value of
+      # 6.0/num_towers to each tower.
+      tlv_ops = tower_local_var.assign(6.0)
+      self.evaluate(tlv_ops)
+      # On reading the tower local var we should get the assigned value back.
+      # The value on all the towers are added before being returned by
+      # `read_var`.
+      self.assertEqual(6.0, self.evaluate(dist.read_var(tower_local_var)))
+
+  @test_util.run_in_graph_and_eager_modes(config=config)
+  def testAssignTowerLocalVarMeanAggregation(self):
+    self._skip_eager_if_gpus_less_than(1)
+    def model_fn():
+      v_sum = variable_scope.variable(
+          1.0,
+          synchronization=variable_scope.VariableSynchronization.ON_READ,
+          aggregation=variable_scope.VariableAggregation.MEAN)
+      return v_sum
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with dist.scope():
+      tower_local_var = dist.call_for_each_tower(model_fn,
+                                                 run_concurrently=False)
+      self.assertTrue(isinstance(tower_local_var, values.TowerLocalVariable))
+      self.evaluate(variables.global_variables_initializer())
+      # Each tower has a value of 1.0 assigned to it in tower context.
+      # When we read the value using `read_var` we should see the MEAN of values
+      # on all towers which is the value assigned in tower context.
+      self.assertEqual(1.0, self.evaluate(dist.read_var(tower_local_var)))
+      tlv_ops = tower_local_var.assign(6.0)
+      self.evaluate(tlv_ops)
+      # On reading the tower local var we should get the MEAN of all values
+      # which is equal to the value assigned.
+      self.assertEqual(6.0, self.evaluate(dist.read_var(tower_local_var)))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/multi_worker_test_base.py b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
index f659be5f42594b275af06435cb0c228e5d594ac9..fa479918bd48224d042725566ec905018b974f45 100644
--- a/tensorflow/contrib/distribute/python/multi_worker_test_base.py
+++ b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
@@ -28,23 +28,39 @@ from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 
 
+def create_in_process_cluster(num_workers, num_ps):
+  """Create an in-process cluster that consists of only standard server."""
+  # Leave some memory for cuda runtime.
+  gpu_mem_frac = 0.7 / num_workers
+  worker_config = config_pb2.ConfigProto()
+  worker_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac
+
+  ps_config = config_pb2.ConfigProto()
+  ps_config.device_count['GPU'] = 0
+
+  # Create in-process servers. Once an in-process tensorflow server is created,
+  # there is no way to terminate it. So we create one cluster per test process.
+  # We could've started the server in another process, we could then kill that
+  # process to terminate the server. The reasons why we don't want multiple
+  # processes are
+  # 1) it is more difficult to manage these processes
+  # 2) there is something global in CUDA such that if we initialize CUDA in the
+  # parent process, the child process cannot initialize it again and thus cannot
+  # use GPUs (https://stackoverflow.com/questions/22950047).
+  return test_util.create_local_cluster(
+      num_workers,
+      num_ps=num_ps,
+      worker_config=worker_config,
+      ps_config=ps_config)
+
+
 class MultiWorkerTestBase(test.TestCase):
   """Base class for testing multi node strategy and dataset."""
 
   @classmethod
   def setUpClass(cls):
     """Create a local cluster with 2 workers."""
-    num_workers = 2
-    # Leave some memory for cuda runtime.
-    gpu_mem_frac = 0.7 / num_workers
-    default_config = config_pb2.ConfigProto()
-    default_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac
-
-    # The local cluster takes some portion of the local GPUs and there is no way
-    # for the cluster to terminate unless using multiple processes. Therefore,
-    # we have to only create only one cluster throughout a test process.
-    workers, _ = test_util.create_local_cluster(
-        num_workers, num_ps=0, worker_config=default_config)
+    workers, _ = create_in_process_cluster(num_workers=2, num_ps=0)
     cls._master_target = workers[0].target
 
   @contextlib.contextmanager
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..d53582279e49db8f6fce059288b47eff4450d833
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
@@ -0,0 +1,353 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Classes implementing a multi-worker ps DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import os
+
+from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
+from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import values
+from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import device_setter
+from tensorflow.python.training import device_util
+from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.training import server_lib
+from tensorflow.python.util import nest
+
+_LOCAL_CPU = "/device:CPU:0"
+_LOCAL_GPU_0 = "/device:GPU:0"
+
+
+def _normalize_cluster_spec(cluster_spec):
+  if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
+    return server_lib.ClusterSpec(cluster_spec)
+  elif not isinstance(cluster_spec, server_lib.ClusterSpec):
+    raise ValueError(
+        "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
+        "`tf.train.ClusterDef` object")
+
+
+# TODO(yuefengz): maybe cache variables on local CPU.
+# TODO(yuefengz): we may want to set session options to disallow communication
+# between workers.
+class ParameterServerStrategy(distribute_lib.DistributionStrategy):
+  """A parameter server DistributionStrategy.
+
+  This strategy class works for both local training and between-graph replicated
+  training for multiple workers. If `cluster_spec` is specified, either passed
+  in to __init__() method or parsed from the
+  ["TF_CONFIG" environment
+  variable](https://www.tensorflow.org/api_docs/python/tf/estimator/RunConfig),
+  variables and updates to those variables are assigned to parameter servers and
+  other operations are assigned to workers. If `cluster_spec` is not set, it
+  becomes local training where variables are assigned to local CPU or the only
+  GPU. When each worker has more than one GPU, operations will be replicated on
+  these GPUs. In both cases, operations are replicated but variables are not and
+  these workers share a common view for which paramater server a variable is
+  assigned to.
+
+  This class assumes between-graph replication will be used and works on a graph
+  for a particular worker.
+
+  It is expected to call `call_for_each_tower(fn, *args, **kwargs)` for any
+  operations which potentially can be replicated across towers (i.e. multiple
+  GPUs) even if there is only CPU or one GPU. When defining the `fn`, extra
+  caution needs to be taken:
+
+  1) Always use @{tf.get_variable} instead of @{tf.Variable} which is not able
+  to refer to the same variable on different towers.
+
+  2) It is generally not recommended to open a device scope under the strategy's
+  scope. A device scope (i.e. calling @{tf.device}) will be merged with or
+  override the device for operations but will not change the device for
+  variables.
+
+  3) It is also not recommended to open a colocation scope (i.e. calling
+  @{tf.colocate_with}) under the strategy's scope. For colocating variables,
+  use `distribution.colocate_vars_with` instead. Colocation of ops will possibly
+  create conflicts of device assignement.
+  """
+
+  def __init__(self,
+               num_gpus_per_worker=0,
+               cluster_spec=None,
+               task_type=None,
+               task_id=None):
+    """Initiailizes this strategy.
+
+    Args:
+      num_gpus_per_worker: number of local GPUs or GPUs per worker.
+      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
+        cluster configurations.
+      task_type: the current task type.
+      task_id: the current task id.
+    """
+    super(ParameterServerStrategy, self).__init__()
+    self._num_gpus_per_worker = num_gpus_per_worker
+    if cluster_spec:
+      cluster_spec = _normalize_cluster_spec(cluster_spec)
+    self._cluster_spec = cluster_spec
+
+    # We typically don't need to do all-reduce in this strategy.
+    self._cross_tower_ops = (
+        cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps(
+            reduce_to_device=_LOCAL_CPU))
+
+    self._initialize_devices(num_gpus_per_worker, cluster_spec, task_type,
+                             task_id)
+
+  def _initialize_devices(self, num_gpus_per_worker, cluster_spec, task_type,
+                          task_id):
+    """Initialize internal devices.
+
+    It creates variable devices and compute devices. Variables and operations
+    will be assigned to them respectively. We have one compute device per tower.
+    The variable device is a device function or device string. The default
+    variable device assigns variables to parameter servers in a round-robin
+    fashion.
+
+    Args:
+      num_gpus_per_worker: number of local GPUs or GPUs per worker.
+      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
+        cluster configurations.
+      task_type: the current task type.
+      task_id: the current task id.
+
+    Raises:
+      ValueError: if the cluster_spec doesn't have ps jobs.
+    """
+    self._task_type = task_type or "worker"
+    self._task_id = task_id or 0
+    self._worker_device = "/job:%s/task:%d" % (self._task_type, self._task_id)
+
+    # TODO(yuefengz): maybe clearer to split it into two classes, one for
+    # the distribuetd case and one for the local case, once we have the factory
+    # class/method.
+
+    # Define compute devices which is a list of device strings and one for each
+    # tower. When there are GPUs, replicate operations on these GPUs. Otherwise,
+    # place operations on CPU.
+    if cluster_spec is None:
+      # Local mode.
+      if num_gpus_per_worker > 0:
+        self._compute_devices = list(
+            map("/device:GPU:{}".format, range(num_gpus_per_worker)))
+      else:
+        self._compute_devices = [_LOCAL_CPU]
+    else:
+      # Distributed mode.
+      if num_gpus_per_worker > 0:
+        self._compute_devices = [
+            "%s/device:GPU:%d" % (self._worker_device, i)
+            for i in range(num_gpus_per_worker)
+        ]
+      else:
+        self._compute_devices = [self._worker_device]
+
+    self._compute_devices = list(
+        map(device_util.resolve, self._compute_devices))
+    self._canonical_compute_device_set = set(self._compute_devices)
+
+    # Define variable device which is a device string in the local case and a
+    # device function in the distributed case. It is used to open a device scope
+    # where varibles are defined.
+    # The `_parameter_devices` is needed for the `parameter_devices` property
+    # and is a list of all variable devices.
+    if cluster_spec is None:
+      # Local mode. If there is only one GPU, put everything on that GPU.
+      # Otherwise, place variables on CPU.
+      if num_gpus_per_worker == 1:
+        assert len(list(self._compute_devices)) == 1
+        self._variable_device = _LOCAL_GPU_0
+        self._parameter_devices = [_LOCAL_GPU_0]
+      else:
+        self._variable_device = _LOCAL_CPU
+        self._parameter_devices = [_LOCAL_CPU]
+    else:
+      # Distributed mode. Place variables on ps jobs in a round-robin fashion.
+      # Note that devices returned from `replica_device_setter` are not
+      # canonical and therefore we don't canonicalize all variable devices to
+      # make them consistent.
+      # TODO(yuefengz): support passing a strategy object to control variable
+      # assignment.
+      # TODO(yuefengz): merge the logic of replica_device_setter into this
+      # class.
+      num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
+      if num_ps_replicas == 0:
+        raise ValueError("The cluster spec needs to have `ps` jobs.")
+      self._variable_device = device_setter.replica_device_setter(
+          ps_tasks=num_ps_replicas,
+          worker_device=self._worker_device,
+          merge_devices=True,
+          cluster=cluster_spec)
+
+      # Parameter devices are all tasks of the "ps" job.
+      self._parameter_devices = map("/job:ps/task:{}".format,
+                                    range(num_ps_replicas))
+
+    # Define the default device in cross-tower mode. In the distributed case, we
+    # set the default device to the corresponding worker to prevent these ops
+    # from being placed on other workers.
+    if cluster_spec is None:
+      self._default_device = None
+    else:
+      self._default_device = self._worker_device
+
+  def distribute_dataset(self, dataset_fn):
+    """Distributes the dataset to each local GPU."""
+    return values.PerDeviceDataset(
+        self._call_dataset_fn(dataset_fn), self._compute_devices, True)
+
+  def _broadcast(self, tensor, destinations):
+    if not cross_tower_ops_lib.check_destinations(destinations):
+      destinations = self._compute_devices
+    return self._cross_tower_ops.broadcast(tensor, destinations)
+
+  # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through
+  # this creator, such as "MutableHashTable".
+  def _create_variable(self, next_creator, *args, **kwargs):
+    if "colocate_with" in kwargs:
+      with ops.device(None):
+        with ops.colocate_with(kwargs["colocate_with"]):
+          return next_creator(*args, **kwargs)
+
+    with ops.colocate_with(None, ignore_existing=True):
+      with ops.device(self._variable_device):
+        return next_creator(*args, **kwargs)
+
+  def _call_for_each_tower(self, fn, *args, **kwargs):
+    # pylint: disable=protected-access
+    return mirrored_strategy._call_for_each_tower(self, fn, *args, **kwargs)
+
+  def _verify_destinations_not_different_worker(self, destinations):
+    if destinations is None:
+      return
+    for d in cross_tower_ops_lib.get_devices_from(destinations):
+      d_spec = tf_device.DeviceSpec.from_string(d)
+      if d_spec.job == self._task_type and d_spec.task != self._task_id:
+        raise ValueError(
+            "Cannot reduce to another worker: %r, current worker is %r" %
+            (d, self._worker_device))
+
+  def _reduce(self, aggregation, value, destinations):
+    self._verify_destinations_not_different_worker(destinations)
+    if not isinstance(value, values.DistributedValues):
+      # pylint: disable=protected-access
+      return mirrored_strategy._reduce_non_distributed_value(
+          self, aggregation, value, destinations)
+
+    return self._cross_tower_ops.reduce(
+        aggregation, value, destinations=destinations)
+
+  def _batch_reduce(self, aggregation, value_destination_pairs):
+    for _, destinations in value_destination_pairs:
+      self._verify_destinations_not_different_worker(destinations)
+    return self._cross_tower_ops.batch_reduce(aggregation,
+                                              value_destination_pairs)
+
+  def _select_single_value(self, structured):
+    """Select any single values in `structured`."""
+
+    def _select_fn(x):  # pylint: disable=g-missing-docstring
+      if isinstance(x, values.Mirrored):
+        if len(x.devices) == 1:
+          return list(x._index.values())[0]  # pylint: disable=protected-access
+        else:
+          raise ValueError(
+              "You cannot update variable with a Mirrored object with multiple "
+              "components %r when using ParameterServerStrategy. You must "
+              "specify a single value or a Mirrored with a single value." % x)
+      elif isinstance(x, values.PerDevice):
+        raise ValueError(
+            "You cannot update variable with a PerDevice object %r when using "
+            "ParameterServerStrategy. You must specify a single value or a "
+            "Mirrored with a single value" % x)
+      else:
+        return x
+
+    return nest.map_structure(_select_fn, structured)
+
+  def _update(self, var, fn, *args, **kwargs):
+    if not isinstance(var, resource_variable_ops.ResourceVariable):
+      raise ValueError(
+          "You can not update `var` %r. It must be a Variable." % var)
+    with ops.colocate_with(var), distribute_lib.UpdateContext(var.device):
+      return fn(var, *self._select_single_value(args),
+                **self._select_single_value(kwargs))
+
+  # TODO(yuefengz): does it need to call _select_single_value?
+  def _update_non_slot(self, colocate_with, fn, *args, **kwargs):
+    with ops.device(
+        colocate_with.device), distribute_lib.UpdateContext(colocate_with):
+      return fn(*args, **kwargs)
+
+  def _unwrap(self, val):
+    if isinstance(val, values.DistributedValues):
+      # Return in a deterministic order.
+      if set(val.devices) == self._canonical_compute_device_set:
+        return [val.get(device=d) for d in self._compute_devices]
+      return [val.get(device=d) for d in sorted(val.devices)]
+    return [val]
+
+  def read_var(self, var):
+    # No need to distinguish between normal variables and tower-local variables.
+    return array_ops.identity(var)
+
+  def configure(self, session_config=None):
+    del session_config
+
+    # Use TF_CONFIG to get the cluster spec and the current job.
+    tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
+    cluster_spec = _normalize_cluster_spec(tf_config.get("cluster", {}))
+
+    task_env = tf_config.get("task", {})
+    if task_env:
+      task_type = task_env.get("type", "worker")
+      task_id = int(task_env.get("index", "0"))
+    else:
+      task_type = "worker"
+      task_id = None
+
+    # Set the devices if cluster_spec is defined in TF_CONFIG but not passed in
+    # the constructor.
+    if not self._cluster_spec and cluster_spec:
+      self._cluster_spec = cluster_spec
+      self._initialize_devices(self._num_gpus_per_worker, cluster_spec,
+                               task_type, task_id)
+
+  @property
+  def num_towers(self):
+    return len(self._compute_devices)
+
+  @property
+  def worker_devices(self):
+    # Make a copy to prevent users from accidentally mutating our copy.
+    return list(self._compute_devices)
+
+  @property
+  def parameter_devices(self):
+    return list(self._parameter_devices)
+
+  def non_slot_devices(self, var_list):
+    return min(var_list, key=lambda x: x.name)
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad538b9e8ee99d3658ef3dbfad9fbe66bcfd2b6d
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
@@ -0,0 +1,455 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ParameterServerStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import json
+import threading
+from absl.testing import parameterized
+
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.contrib.distribute.python import multi_worker_test_base
+from tensorflow.contrib.distribute.python import parameter_server_strategy
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.eager import context
+from tensorflow.python.estimator import run_config
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.layers import core
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import device_util
+from tensorflow.python.training import distribute as distribute_lib
+
+
+class ParameterServerStrategyTest(test.TestCase, parameterized.TestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    cls._workers, cls._ps = multi_worker_test_base.create_in_process_cluster(
+        num_workers=3, num_ps=2)
+
+  def setUp(self):
+    self._result = 0
+    self._lock = threading.Lock()
+    self._init_condition = threading.Condition()
+    self._init_reached = 0
+    self._finish_condition = threading.Condition()
+    self._finish_reached = 0
+
+  def _get_ps_distribution_strategy(self, task_type, task_index, num_gpus=0):
+    tf_config = {
+        'cluster': {
+            run_config.TaskType.WORKER: [
+                'fake_worker_0', 'fake_worker_1', 'fake_worker_2'
+            ],
+            run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1']
+        },
+        'task': {
+            'type': task_type,
+            'index': task_index
+        }
+    }
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=num_gpus)
+    with self._lock:
+      # Accessing environment variables should be protected by locks because
+      # environment variables are shared by all threads.
+      with test.mock.patch.dict('os.environ',
+                                {'TF_CONFIG': json.dumps(tf_config)}):
+        distribution.configure()
+    return distribution
+
+  @contextlib.contextmanager
+  def _test_session(self, target):
+    config = config_pb2.ConfigProto(allow_soft_placement=True)
+    config.graph_options.optimizer_options.opt_level = -1
+    with session.Session(graph=None, config=config, target=target) as sess:
+      yield sess
+
+  def _test_device_assignment_distributed(self, d, num_gpus=0):
+    with ops.Graph().as_default(), \
+         self._test_session(target=self._workers[0].target) as sess, \
+         d.scope():
+
+      # Define a variable outside the call_for_each_tower scope. This is not
+      # recommended.
+      n = variable_scope.get_variable('n', initializer=10.0)
+      self.assertEqual(n.device, '/job:ps/task:0')
+
+      def model_fn():
+        if num_gpus == 0:
+          last_part_device = 'device:CPU:0'
+        else:
+          last_part_device = (
+              'device:GPU:%d' % distribute_lib.get_tower_context().tower_id)
+
+        a = constant_op.constant(1.0)
+        b = constant_op.constant(2.0)
+        c = a + b
+        self.assertEqual(a.device,
+                         '/job:worker/replica:0/task:1/%s' % last_part_device)
+        self.assertEqual(b.device,
+                         '/job:worker/replica:0/task:1/%s' % last_part_device)
+        self.assertEqual(c.device,
+                         '/job:worker/replica:0/task:1/%s' % last_part_device)
+
+        # The device scope is ignored for variables but not for normal ops.
+        with ops.device('/job:worker/task:0'):
+          x = variable_scope.get_variable('x', initializer=10.0)
+          x_add = x.assign_add(c)
+          e = a + c
+        # The variable x is on the task 1 since the device_function has been
+        # called once before the model_fn.
+        self.assertEqual(x.device, '/job:ps/task:1')
+        self.assertEqual(x_add.device, x.device)
+        self.assertEqual(e.device,
+                         '/job:worker/replica:0/task:0/%s' % last_part_device)
+
+        # The colocate_vars_with can override the distribution's device.
+        with d.colocate_vars_with(x):
+          y = variable_scope.get_variable('y', initializer=20.0)
+        y_add = y.assign_add(x_add)
+        self.assertEqual(y.device, '/job:ps/task:1')
+        self.assertEqual(y_add.device, y.device)
+        self.assertEqual(y.device, x.device)
+
+        z = variable_scope.get_variable('z', initializer=10.0)
+        self.assertEqual(z.device, '/job:ps/task:0')
+        self.assertNotEqual(z.device, x.device)
+
+        with ops.control_dependencies([y_add]):
+          z_add = z.assign_add(y)
+        with ops.control_dependencies([z_add]):
+          f = z + c
+        self.assertEqual(f.device,
+                         '/job:worker/replica:0/task:1/%s' % last_part_device)
+
+        # The device scope would merge with the default worker device.
+        with ops.device('/CPU:1'):
+          g = e + 1.0
+        self.assertEqual(g.device, '/job:worker/replica:0/task:1/device:CPU:1')
+
+        # Ths ops.colocate_with will be ignored when defining a variale but not
+        # for a normal tensor.
+        with ops.colocate_with(x):
+          u = variable_scope.get_variable('u', initializer=30.0)
+          v = variable_scope.get_variable('v', initializer=30.0)
+          h = f + 1.0
+        self.assertIn('/job:ps/', u.device)
+        self.assertIn('/job:ps/', v.device)
+        # u and v are on different parameter servers.
+        self.assertTrue(u.device != x.device or v.device != x.device)
+        self.assertTrue(u.device == x.device or v.device == x.device)
+        # Here h is not on one worker. Note h.device is canonical while x.device
+        # is not but.
+        self.assertIn('/job:ps/', h.device)
+        return y_add, z_add, f
+
+      y, z, f = d.call_for_each_tower(model_fn)
+      self.assertNotEqual(y, None)
+      self.assertNotEqual(z, None)
+      self.assertNotEqual(f, None)
+
+      if context.num_gpus() >= 1 and num_gpus <= 1:
+        variables.global_variables_initializer().run()
+        y_val, z_val, f_val = sess.run([y, z, f])
+        self.assertEqual(y_val, 33.0)
+        self.assertEqual(z_val, 43.0)
+        self.assertEqual(f_val, 46.0)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
+  def testDeviceAssignmentDistributed(self, num_gpus):
+    d = self._get_ps_distribution_strategy('worker', 1, num_gpus=num_gpus)
+    self._test_device_assignment_distributed(d, num_gpus=num_gpus)
+
+  def _test_device_assignment_local(self,
+                                    d,
+                                    compute_device='CPU',
+                                    variable_device='CPU',
+                                    num_gpus=0):
+    with ops.Graph().as_default(), \
+         self._test_session(target=self._workers[0].target) as sess, \
+         d.scope():
+
+      def model_fn():
+        if 'CPU' in compute_device:
+          tower_compute_device = '/device:CPU:0'
+        else:
+          tower_compute_device = (
+              '/device:GPU:%d' % distribute_lib.get_tower_context().tower_id)
+        tower_compute_device = device_util.canonicalize(tower_compute_device)
+
+        if 'CPU' in variable_device:
+          tower_variable_device = '/device:CPU:0'
+        else:
+          tower_variable_device = (
+              '/device:GPU:%d' % distribute_lib.get_tower_context().tower_id)
+        tower_variable_device = device_util.canonicalize(tower_variable_device)
+
+        a = constant_op.constant(1.0)
+        b = constant_op.constant(2.0)
+        c = a + b
+        self.assertEqual(a.device, tower_compute_device)
+        self.assertEqual(b.device, tower_compute_device)
+        self.assertEqual(c.device, tower_compute_device)
+
+        # The device scope is ignored for variables but not for normal ops.
+        with ops.device('/device:GPU:2'):
+          x = variable_scope.get_variable('x', initializer=10.0)
+          x_add = x.assign_add(c)
+          e = a + c
+        self.assertEqual(
+            device_util.canonicalize(x.device), tower_variable_device)
+        self.assertEqual(x_add.device, x.device)
+        self.assertEqual(e.device, device_util.canonicalize('/device:GPU:2'))
+
+        # The colocate_vars_with can override the distribution's device.
+        with d.colocate_vars_with(x):
+          y = variable_scope.get_variable('y', initializer=20.0)
+        y_add = y.assign_add(x_add)
+        self.assertEqual(
+            device_util.canonicalize(y.device), tower_variable_device)
+        self.assertEqual(y_add.device, y.device)
+        self.assertEqual(y.device, x.device)
+
+        z = variable_scope.get_variable('z', initializer=10.0)
+        self.assertEqual(
+            device_util.canonicalize(z.device), tower_variable_device)
+
+        with ops.control_dependencies([y_add]):
+          z_add = z.assign_add(y)
+        with ops.control_dependencies([z_add]):
+          f = z + c
+        self.assertEqual(f.device, tower_compute_device)
+
+        # The device scope would merge with the default worker device.
+        with ops.device('/CPU:1'):
+          g = e + 1.0
+        self.assertEqual(g.device, device_util.canonicalize('/device:CPU:1'))
+
+        # Ths ops.colocate_with will be ignored when defining a variale but not
+        # for a normal tensor.
+        with ops.colocate_with(x):
+          u = variable_scope.get_variable('u', initializer=30.0)
+          h = f + 1.0
+        self.assertEqual(
+            device_util.canonicalize(u.device), tower_variable_device)
+        self.assertEqual(device_util.canonicalize(x.device), h.device)
+        return y_add, z_add, f
+
+      y, z, f = d.call_for_each_tower(model_fn)
+      self.assertNotEqual(y, None)
+      self.assertNotEqual(z, None)
+      self.assertNotEqual(f, None)
+
+      if context.num_gpus() >= 1 and num_gpus <= 1:
+        variables.global_variables_initializer().run()
+        y_val, z_val, f_val = sess.run([y, z, f])
+        self.assertEqual(y_val, 33.0)
+        self.assertEqual(z_val, 43.0)
+        self.assertEqual(f_val, 46.0)
+
+  def testDeviceAssignmentLocal(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=0)
+    self._test_device_assignment_local(
+        distribution, compute_device='CPU', variable_device='CPU', num_gpus=0)
+
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=1)
+    self._test_device_assignment_local(
+        distribution, compute_device='GPU', variable_device='GPU', num_gpus=1)
+
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    self._test_device_assignment_local(
+        distribution, compute_device='GPU', variable_device='CPU', num_gpus=2)
+
+  def _test_simple_increment(self, d, task_type, task_index, master_target):
+    if hasattr(d, '_cluster_spec') and d._cluster_spec:
+      num_workers = len(d._cluster_spec.as_dict().get('worker',
+                                                      ['dummy_worker']))
+    else:
+      num_workers = 1
+    with ops.Graph().as_default(), \
+         self._test_session(target=master_target) as sess, \
+         d.scope():
+
+      def model_fn():
+        x = variable_scope.get_variable('x', initializer=10.0)
+        y = variable_scope.get_variable('y', initializer=20.0)
+
+        x_add = x.assign_add(1.0, use_locking=True)
+        y_add = y.assign_add(1.0, use_locking=True)
+
+        train_op = control_flow_ops.group([x_add, y_add])
+        return x, y, train_op
+
+      x, y, train_op = d.call_for_each_tower(model_fn)
+      train_op = d.group(d.unwrap(train_op))
+
+      if context.num_gpus() < d._num_gpus_per_worker:
+        return True
+
+      if task_index == 0:
+        variables.global_variables_initializer().run()
+
+      # Workers waiting for chief worker's initializing variables.
+      self._init_condition.acquire()
+      self._init_reached += 1
+      while self._init_reached != num_workers:
+        self._init_condition.wait()
+      self._init_condition.notify_all()
+      self._init_condition.release()
+
+      sess.run(train_op)
+
+      # Wait for other workers to finish training.
+      self._finish_condition.acquire()
+      self._finish_reached += 1
+      while self._finish_reached != num_workers:
+        self._finish_condition.wait()
+      self._finish_condition.notify_all()
+      self._finish_condition.release()
+
+      x_val, y_val = sess.run([x, y])
+      self.assertEqual(x_val, 10.0 + 1.0 * num_workers * d.num_towers)
+      self.assertEqual(y_val, 20.0 + 1.0 * num_workers * d.num_towers)
+      return (x_val == 10.0 + 1.0 * num_workers * d.num_towers and
+              y_val == 20.0 + 1.0 * num_workers * d.num_towers)
+
+  def _test_minimize_loss_graph(self, d, task_type, task_index, master_target):
+    with ops.Graph().as_default(), \
+         self._test_session(target=master_target) as sess, \
+         d.scope():
+      l = core.Dense(1, use_bias=False)
+
+      def loss_fn(x):
+        y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
+        return y * y
+
+      # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for
+      # multiple graphs (b/111216820).
+      def grad_fn(x):
+        loss = loss_fn(x)
+        var_list = (
+            variables.trainable_variables() + ops.get_collection(
+                ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
+        grads = gradients.gradients(loss, var_list)
+        ret = list(zip(grads, var_list))
+        return ret
+
+      def update(v, g):
+        return v.assign_sub(0.05 * g, use_locking=True)
+
+      one = d.broadcast(constant_op.constant([[1.]]))
+
+      def step():
+        """Perform one optimization step."""
+        # Run forward & backward to get gradients, variables list.
+        g_v = d.call_for_each_tower(grad_fn, one)
+        # Update the variables using the gradients and the update() function.
+        before_list = []
+        after_list = []
+        for g, v in g_v:
+          fetched = d.read_var(v)
+          before_list.append(fetched)
+          with ops.control_dependencies([fetched]):
+            # TODO(yuefengz): support non-Mirrored variable as destinations.
+            g = d.reduce(
+                variable_scope.VariableAggregation.SUM, g, destinations=v)
+            with ops.control_dependencies(d.unwrap(d.update(v, update, g))):
+              after_list.append(d.read_var(v))
+        return before_list, after_list
+
+      before_out, after_out = step()
+
+      if context.num_gpus() < d._num_gpus_per_worker:
+        return True
+
+      if task_index == 0:
+        variables.global_variables_initializer().run()
+
+      # Workers waiting for chief worker's initializing variables.
+      self._init_condition.acquire()
+      self._init_reached += 1
+      while self._init_reached != 3:
+        self._init_condition.wait()
+      self._init_condition.notify_all()
+      self._init_condition.release()
+
+      for i in range(10):
+        b, a = sess.run((before_out, after_out))
+        if i == 0:
+          before, = b
+        after, = a
+
+      error_before = abs(before - 1)
+      error_after = abs(after - 1)
+      # Error should go down
+      self.assertLess(error_after, error_before)
+      return error_after < error_before
+
+  def _run_client(self, index, model_fn, num_gpus):
+    task_type = run_config.TaskType.WORKER
+    result = model_fn(
+        self._get_ps_distribution_strategy(task_type, index, num_gpus=num_gpus),
+        task_type, index, self._workers[index].target)
+    if result:
+      with self._lock:
+        self._result += 1
+
+  def _run_multiple_clients(self, num_clients, model_fn, num_gpus=0):
+    threads = []
+    for i in range(num_clients):
+      t = threading.Thread(
+          target=self._run_client, args=(i, model_fn, num_gpus))
+      t.start()
+      threads.append(t)
+    for t in threads:
+      t.join()
+
+  def testSimpleBetweenGraph(self):
+    self._run_multiple_clients(3, self._test_simple_increment)
+    self.assertEqual(self._result, 3)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
+  def testLocalSimpleIncrement(self, num_gpus):
+    d = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=num_gpus)
+    self._test_simple_increment(d, 'dummy_worker', 0, '')
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2]))
+  def testMinimizeLossGraph(self, num_gpus):
+    self._run_multiple_clients(
+        3, self._test_minimize_loss_graph, num_gpus=num_gpus)
+    self.assertEqual(self._result, 3)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 1761a432512f5f1d0f393928a8110cc19eeaba92..4018b1e02339e377acc0594407a4f89791ff57af 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.distribute.python import prefetching_ops_v2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -77,6 +78,13 @@ class DistributedValues(object):
   def devices(self):
     return list(self._index.keys())
 
+  @property
+  def is_tensor_like(self):
+    for v in self._index.values():
+      if not tensor_util.is_tensor(v):
+        return False
+    return True
+
   def __str__(self):
     return "%s:%s" % (self.__class__.__name__, self._index)
 
@@ -196,11 +204,54 @@ class DistributedVariable(DistributedDelegate):
     # to the container without introducing a reference cycle.
     for v in six.itervalues(index):
       v._distributed_container = weakref.ref(self)  # pylint: disable=protected-access
+    # tf.keras keeps track of variables initialized using this attribute. When
+    # tf.keras gets the default session, it initializes all uninitialized vars.
+    # We need to make _keras_initialized a member of DistributedVariable because
+    # without this it will use `__getattr__` which will delegate to a component
+    # variable.
+    self._keras_initialized = False
+    # Typically, a `DistributedVariable`'s initializer is composed of the
+    # initializers of the components variables. However, in some cases, such as
+    # when restoring from a checkpoint, we may set the _initializer_op
+    # property on the entire `DistributedVariable`.
+    self._initializer_op = None
     super(DistributedVariable, self).__init__(index)
 
+  def is_initialized(self, name=None):
+    """Identifies if all the component variables are initialized.
+
+    Args:
+      name: Name of the final `logical_and` op.
+
+    Returns:
+      The op that evaluates to True or False depending on if all the
+      component variables are initialized.
+    """
+    # We have to cast the self._index.values() to a `list` because when we
+    # use `model_to_estimator` to run tf.keras models, self._index.values() is
+    # of type `dict_values` and not `list`.
+    values_list = list(self._index.values())
+    result = values_list[0].is_initialized()
+    # We iterate through the list of values except the last one to allow us to
+    # name the final `logical_and` op the same name that is passed by the user
+    # to the `is_initialized` op. For distributed variables, the
+    # `is_initialized` op is a `logical_and` op.
+    for v in values_list[1:-1]:
+      result = math_ops.logical_and(result, v.is_initialized())
+    result = math_ops.logical_and(result, values_list[-1].is_initialized(),
+                                  name=name)
+    return result
+
   @property
   def initializer(self):
-    return control_flow_ops.group([v.initializer for v in self._index.values()])
+    if self._initializer_op:
+      init_op = self._initializer_op
+    else:
+      # return grouped ops of all the var initializations of component values of
+      # the mirrored variable
+      init_op = control_flow_ops.group(
+          [v.initializer for v in self._index.values()])
+    return init_op
 
   @property
   def graph(self):
@@ -296,12 +347,6 @@ class MirroredVariable(DistributedVariable, Mirrored,
     for v in six.itervalues(index):
       v._mirrored_container = weakref.ref(self)  # pylint: disable=protected-access
     self._primary_var = primary_var
-    # tf.keras keeps track of variables initialized using this attribute. When
-    # tf.keras gets the default session, it initializes all uninitialized vars.
-    # We need to make _keras_initialized a member of MirroredVariable because
-    # without this it will use `__getattr__` which will delegate to a component
-    # variable.
-    self._keras_initialized = False
     self._aggregation = aggregation
     super(MirroredVariable, self).__init__(index)
 
@@ -325,6 +370,7 @@ class MirroredVariable(DistributedVariable, Mirrored,
       return distribute_lib.get_distribution_strategy().update(
           self, f, *args, **kwargs)
     else:
+      _assert_tower_context()
       # We are calling an assign function on the mirrored variable in tower
       # context.
       # We reduce the value we want to assign/add/sub. More details about how we
@@ -357,28 +403,6 @@ class MirroredVariable(DistributedVariable, Mirrored,
     assign_fn = lambda var, *a, **kw: var.assign(*a, **kw)
     return self._assign_func(f=assign_fn, *args, **kwargs)
 
-  def is_initialized(self, name=None):
-    # We have to cast the self._index.values() to a `list` because when we
-    # use `model_to_estimator` to run tf.keras models, self._index.values() is
-    # of type `dict_values` and not `list`.
-    values_list = list(self._index.values())
-    result = values_list[0].is_initialized()
-    # We iterate through the list of values except the last one to allow us to
-    # name the final `logical_and` op the same name that is passed by the user
-    # to the `is_initialized` op. For mirrored variables, the `is_initialized`
-    # op is a `logical_and` op.
-    for v in values_list[1:-1]:
-      result = math_ops.logical_and(result, v.is_initialized())
-    result = math_ops.logical_and(result, values_list[-1].is_initialized(),
-                                  name=name)
-    return result
-
-  @property
-  def initializer(self):
-    # return grouped ops of all the var initializations of component values of
-    # the mirrored variable
-    return control_flow_ops.group([v.initializer for v in self._index.values()])
-
   @property
   def aggregation(self):
     return self._aggregation
@@ -443,14 +467,7 @@ class _TowerLocalSaveable(saver.BaseSaverBuilder.SaveableObject):
   def restore(self, restored_tensors, restored_shapes):
     """Restore the same value into all variables."""
     tensor, = restored_tensors
-    # To preserve the sum across save and restore, we have to divide the
-    # total across all devices when restoring a variable that was summed
-    # when saving.
-    if self._tower_local_variable.aggregation == vs.VariableAggregation.SUM:
-      tensor *= 1. / len(self._tower_local_variable.devices)
-    return control_flow_ops.group([
-        _assign_on_device(d, v, tensor)
-        for d, v in six.iteritems(self._tower_local_variable._index)])  # pylint: disable=protected-access
+    return self._tower_local_variable.assign(tensor)
 
 
 def _assert_tower_context():
@@ -466,12 +483,6 @@ class TowerLocalVariable(DistributedVariable, PerDevice,
   def __init__(self, index, primary_var, aggregation):
     self._primary_var = primary_var
     self._aggregation = aggregation
-    # tf.keras keeps track of variables initialized using this attribute. When
-    # tf.keras gets the default session, it initializes all uninitialized vars.
-    # We need to make _keras_initialized a member of TowerLocalVariable because
-    # without this it will use `__getattr__` which will delegate to a component
-    # variable.
-    self._keras_initialized = False
     super(TowerLocalVariable, self).__init__(index)
 
   def assign_sub(self, *args, **kwargs):
@@ -483,30 +494,19 @@ class TowerLocalVariable(DistributedVariable, PerDevice,
     return self.get().assign_add(*args, **kwargs)
 
   def assign(self, *args, **kwargs):
-    _assert_tower_context()
-    return self.get().assign(*args, **kwargs)
-
-  def is_initialized(self, name=None):
-    # We have to cast the self._index.values() to a `list` because when we
-    # use `model_to_estimator` to run tf.keras models, self._index.values() is
-    # of type `dict_values` and not `list`.
-    values_list = list(self._index.values())
-    result = values_list[0].is_initialized()
-    # We iterate through the list of values except the last one to allow us to
-    # name the final `logical_and` op the same name that is passed by the user
-    # to the `is_initialized` op. For tower local variables, the
-    # `is_initialized` op is a `logical_and` op.
-    for v in values_list[1:-1]:
-      result = math_ops.logical_and(result, v.is_initialized())
-    result = math_ops.logical_and(result, values_list[-1].is_initialized(),
-                                  name=name)
-    return result
-
-  @property
-  def initializer(self):
-    # return grouped ops of all the var initializations of component values of
-    # the tower local variable
-    return control_flow_ops.group([v.initializer for v in self._index.values()])
+    if distribute_lib.get_cross_tower_context():
+      # To preserve the sum across save and restore, we have to divide the
+      # total across all devices when restoring a variable that was summed
+      # when saving.
+      tensor = args[0]
+      if self._aggregation == vs.VariableAggregation.SUM:
+        tensor *= 1. / len(self.devices)
+      return control_flow_ops.group(
+          [_assign_on_device(d, v, tensor)
+           for d, v in six.iteritems(self._index)])
+    else:
+      _assert_tower_context()
+      return self.get().assign(*args, **kwargs)
 
   @property
   def aggregation(self):
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
index 8e44f2fea16ac851c124b573948ee14ec0640556..91a43d499933c77de846085e0f12abf3064b0499 100644
--- a/tensorflow/contrib/distribute/python/values_test.py
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
@@ -79,6 +80,30 @@ class DistributedValuesTest(test.TestCase):
     with self.assertRaises(AssertionError):
       v = values.DistributedValues({"/device:cpu:0": 42})
 
+  def testIsTensorLike(self):
+    with context.graph_mode(), \
+         ops.Graph().as_default(), \
+         ops.device("/device:CPU:0"):
+      one = constant_op.constant(1)
+      two = constant_op.constant(2)
+      v = values.DistributedValues({"/device:CPU:0": one, "/device:GPU:0": two})
+      self.assertEqual(two, v.get("/device:GPU:0"))
+      self.assertEqual(one, v.get())
+      self.assertTrue(v.is_tensor_like)
+      self.assertTrue(tensor_util.is_tensor(v))
+
+  def testIsTensorLikeWithAConstant(self):
+    with context.graph_mode(), \
+         ops.Graph().as_default(), \
+         ops.device("/device:CPU:0"):
+      one = constant_op.constant(1)
+      two = 2.0
+      v = values.DistributedValues({"/device:CPU:0": one, "/device:GPU:0": two})
+      self.assertEqual(two, v.get("/device:GPU:0"))
+      self.assertEqual(one, v.get())
+      self.assertFalse(v.is_tensor_like)
+      self.assertFalse(tensor_util.is_tensor(v))
+
 
 class DistributedDelegateTest(test.TestCase):
 
diff --git a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
index ef3bdfa75fcaa8df17db1238ceadadf788601356..18a0f754e6e618f240db109f593a80dec57e200b 100644
--- a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
+++ b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py
@@ -326,6 +326,21 @@ class QuantizedDistribution(distributions.Distribution):
         graph_parents=graph_parents,
         name=name)
 
+  @property
+  def distribution(self):
+    """Base distribution, p(x)."""
+    return self._dist
+
+  @property
+  def low(self):
+    """Lowest value that quantization returns."""
+    return self._low
+
+  @property
+  def high(self):
+    """Highest value that quantization returns."""
+    return self._high
+
   def _batch_shape_tensor(self):
     return self.distribution.batch_shape_tensor()
 
@@ -569,8 +584,3 @@ class QuantizedDistribution(distributions.Distribution):
       dependencies = [distribution_util.assert_integer_form(
           value, message="value has non-integer components.")]
       return control_flow_ops.with_dependencies(dependencies, value)
-
-  @property
-  def distribution(self):
-    """Base distribution, p(x)."""
-    return self._dist
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index 58c548d798178a2848006cbf301f7d5cb2143f24..e31dbbe80f9634e8e45ec91bf395eab82942c8ce 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -18,33 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import threading
-
 from tensorflow.contrib.data.python.ops import prefetching_ops
 from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
 from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.saver import BaseSaverBuilder
 
-_uid_counter = 0
-_uid_lock = threading.Lock()
-
-
-def _generate_shared_name(prefix):
-  with _uid_lock:
-    global _uid_counter
-    uid = _uid_counter
-    _uid_counter += 1
-  return "{}{}".format(prefix, uid)
-
 
 class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase):
   """An iterator producing tf.Tensor objects from a tf.data.Dataset.
@@ -80,38 +61,18 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase):
           "`tf.contrib.eager.Iterator`. Use `for ... in dataset:` to iterate "
           "over the dataset instead.")
 
-    super(Iterator, self).__init__(dataset)
     if not context.context().device_spec.device_type:
       is_remote_device = False
     else:
       is_remote_device = context.context().device_spec.device_type != "CPU"
-    self._buffer_resource_handle = None
     if is_remote_device:
-      with ops.device("/device:CPU:0"):
-        iter_string_handle = gen_dataset_ops.iterator_to_string_handle(
-            self._resource)
-
-        @function.Defun(dtypes.string)
-        def remote_fn(h):
-          remote_iterator = iterator_ops.Iterator.from_string_handle(
-              h, self.output_types, self.output_shapes, self.output_classes)
-          return remote_iterator.get_next()
-
-        remote_fn.add_to_graph(None)
-        target = constant_op.constant("/device:CPU:0")
-      with ops.device(self._device):
-        self._buffer_resource_handle = prefetching_ops.function_buffering_resource(  # pylint: disable=line-too-long
-            string_arg=iter_string_handle,
-            output_types=self._flat_output_types,
-            f=remote_fn,
-            target_device=target,
-            buffer_size=10,
-            container="",
-            shared_name=_generate_shared_name(
-                "contrib_eager_iterator_function_buffer_resource"))
-        self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter(  # pylint: disable=line-too-long
-            handle=self._buffer_resource_handle,
-            handle_device=self._device)
+      with ops.device(None):
+        # Let the placer figure out where to place the various functions etc.
+        # created by the CopyToDeviceDataset.
+        dataset = dataset.apply(prefetching_ops.copy_to_device(
+            context.context().device_name))
+        dataset = dataset.prefetch(1)
+    super(Iterator, self).__init__(dataset)
 
   def _next_internal(self):
     """Returns a nested structure of `tf.Tensor`s containing the next element.
@@ -120,16 +81,7 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase):
     # that there is no more data to iterate over.
     # TODO(b/77291417): Fix
     with context.execution_mode(context.SYNC):
-      if self._buffer_resource_handle is not None:
-        with ops.device(self._device):
-          ret = prefetching_ops.function_buffering_resource_get_next(
-              function_buffer_resource=self._buffer_resource_handle,
-              output_types=self._flat_output_types)
-        return sparse.deserialize_sparse_tensors(
-            nest.pack_sequence_as(self._output_types, ret), self._output_types,
-            self._output_shapes, self._output_classes)
-      else:
-        return super(Iterator, self)._next_internal()
+      return super(Iterator, self)._next_internal()
 
   # TODO(shivaniagrawal): Expose checkpointable stateful objects from dataset
   # attributes(potential).
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index 68bec9aee894edd60a025ac1cf87ca3e010db842..acc605247faffcf7ba83891dacdab13fc8c8574a 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -193,6 +193,20 @@ class IteratorTest(test.TestCase):
       x = math_ops.add(x, x)
     self.assertAllEqual([0., 2.], x.numpy())
 
+  def testGpuTensor(self):
+    ds = Dataset.from_tensors([0., 1.])
+    with ops.device(test.gpu_device_name()):
+      for x in ds:
+        y = math_ops.add(x, x)
+    self.assertAllEqual([0., 2.], y.numpy())
+
+  def testGpuDefinedDataset(self):
+    with ops.device(test.gpu_device_name()):
+      ds = Dataset.from_tensors([0., 1.])
+      for x in ds:
+        y = math_ops.add(x, x)
+    self.assertAllEqual([0., 2.], y.numpy())
+
   def testTensorsExplicitPrefetchToDevice(self):
     ds = Dataset.from_tensor_slices([0., 1.])
     ds = ds.apply(prefetching_ops.prefetch_to_device(test.gpu_device_name()))
diff --git a/tensorflow/contrib/eager/python/examples/gan/mnist.py b/tensorflow/contrib/eager/python/examples/gan/mnist.py
index b33243021bb56492a3f5543c3e2a5f07c66f021f..9a4217929916c258b7e8f2e5b3add2905d20d1da 100644
--- a/tensorflow/contrib/eager/python/examples/gan/mnist.py
+++ b/tensorflow/contrib/eager/python/examples/gan/mnist.py
@@ -29,7 +29,6 @@ import time
 
 import tensorflow as tf
 
-import tensorflow.contrib.eager as tfe
 from tensorflow.examples.tutorials.mnist import input_data
 
 layers = tf.keras.layers
@@ -265,7 +264,7 @@ def train_one_epoch(generator, discriminator, generator_optimizer,
 
 def main(_):
   (device, data_format) = ('/gpu:0', 'channels_first')
-  if FLAGS.no_gpu or tfe.num_gpus() <= 0:
+  if FLAGS.no_gpu or tf.contrib.eager.num_gpus() <= 0:
     (device, data_format) = ('/cpu:0', 'channels_last')
   print('Using device %s, and data format %s.' % (device, data_format))
 
@@ -291,7 +290,7 @@ def main(_):
   latest_cpkt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
   if latest_cpkt:
     print('Using latest checkpoint at ' + latest_cpkt)
-  checkpoint = tfe.Checkpoint(**model_objects)
+  checkpoint = tf.train.Checkpoint(**model_objects)
   # Restore variables on creation if a checkpoint exists.
   checkpoint.restore(latest_cpkt)
 
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
index 43c8c355dc9a07d865a21ebd4c338419747ac1e1..44ff43a1112e771eb6c91c398286a003e17632e0 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/dcgan.ipynb
@@ -27,11 +27,11 @@
         "id": "ITZuApL56Mny"
       },
       "source": [
-        "This notebook demonstrates how to generate images of handwritten digits using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). To do this, we use Deep Convolutional Generative Adverserial Networks ([DCGAN](https://arxiv.org/pdf/1511.06434.pdf)).\n",
+        "This notebook demonstrates how to generate images of handwritten digits using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). To do so, we use Deep Convolutional Generative Adverserial Networks ([DCGAN](https://arxiv.org/pdf/1511.06434.pdf)).\n",
         "\n",
-        "On a colab GPU(Tesla K80), the model takes around 40 seconds per epoch to train.\n",
+        "This model takes about ~30 seconds per epoch (using tf.contrib.eager.defun to create graph functions) to train on a single Tesla K80 on Colab, as of July 2018.\n",
         "\n",
-        "Below is the output generated after training the generator and discriminator models for 100 epochs.\n",
+        "Below is the output generated after training the generator and discriminator models for 150 epochs.\n",
         "\n",
         "![sample output](https://tensorflow.org/images/gan/dcgan.gif)"
       ]
@@ -80,6 +80,8 @@
       },
       "outputs": [],
       "source": [
+        "from __future__ import absolute_import, division, print_function\n",
+        "\n",
         "# Import TensorFlow \u003e= 1.9 and enable eager execution\n",
         "import tensorflow as tf\n",
         "tf.enable_eager_execution()\n",
@@ -201,13 +203,13 @@
         "## Write the generator and discriminator models\n",
         "\n",
         "* **Generator** \n",
-        "  * It is responsible for **creating the convincing images good enough to fool the discriminator**.\n",
-        "  * It consists of Conv2DTranspose(Upsampling) layers. We start with a fully connected layer and upsample the image 2 times so as to reach the desired image size(mnist image size) which is (28, 28, 1). \n",
+        "  * It is responsible for **creating convincing images that are good enough to fool the discriminator**.\n",
+        "  * It consists of Conv2DTranspose (Upsampling) layers. We start with a fully connected layer and upsample the image 2 times so as to reach the desired image size (mnist image size) which is (28, 28, 1). \n",
         "  * We use **leaky relu** activation except for the **last layer** which uses **tanh** activation.\n",
         "  \n",
         "* **Discriminator**\n",
         "  * **The discriminator is responsible for classifying the fake images from the real images.**\n",
-        "  * In other words, the discriminator is given generated images(from the generator) and the real MNIST images. The job of the discriminator is to classify these images into fake(generated) and real(MNIST images).\n",
+        "  * In other words, the discriminator is given generated images (from the generator) and the real MNIST images. The job of the discriminator is to classify these images into fake (generated) and real (MNIST images).\n",
         "  * **Basically the generator should be good enough to fool the discriminator that the generated images are real**."
       ]
     },
@@ -312,6 +314,26 @@
         "discriminator = Discriminator()"
       ]
     },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "k1HpMSLImuRi"
+      },
+      "outputs": [],
+      "source": [
+        "# Defun gives 10 secs/epoch performance boost\n",
+        "generator.call = tf.contrib.eager.defun(generator.call)\n",
+        "discriminator.call = tf.contrib.eager.defun(discriminator.call)"
+      ]
+    },
     {
       "cell_type": "markdown",
       "metadata": {
@@ -323,8 +345,8 @@
         "\n",
         "* **Discriminator loss**\n",
         "  * The discriminator loss function takes 2 inputs; **real images, generated images**\n",
-        "  * real_loss is a sigmoid cross entropy loss of the **real images** and an **array of ones(since these are the real images)**\n",
-        "  * generated_loss is a sigmoid cross entropy loss of the **generated images** and an **array of zeros(since these are the fake images)**\n",
+        "  * real_loss is a sigmoid cross entropy loss of the **real images** and an **array of ones (since these are the real images)**\n",
+        "  * generated_loss is a sigmoid cross entropy loss of the **generated images** and an **array of zeros (since these are the fake images)**\n",
         "  * Then the total_loss is the sum of real_loss and the generated_loss\n",
         "  \n",
         "* **Generator loss**\n",
@@ -411,9 +433,9 @@
         "\n",
         "* We start by iterating over the dataset\n",
         "* The generator is given **noise as an input** which when passed through the generator model will output a image looking like a handwritten digit\n",
-        "* The discriminator is given the **real MNIST images as well as the generated images(from the generator)**.\n",
+        "* The discriminator is given the **real MNIST images as well as the generated images (from the generator)**.\n",
         "* Next, we calculate the generator and the discriminator loss.\n",
-        "* Then, we calculate the gradients of loss with respect to both the generator and the discriminator variables(inputs) and apply those to the optimizer.\n",
+        "* Then, we calculate the gradients of loss with respect to both the generator and the discriminator variables (inputs) and apply those to the optimizer.\n",
         "\n",
         "## Generate Images\n",
         "\n",
@@ -442,7 +464,7 @@
         "noise_dim = 100\n",
         "num_examples_to_generate = 100\n",
         "\n",
-        "# keeping the random vector constant for generation(prediction) so\n",
+        "# keeping the random vector constant for generation (prediction) so\n",
         "# it will be easier to see the improvement of the gan.\n",
         "random_vector_for_generation = tf.random_normal([num_examples_to_generate,\n",
         "                                                 noise_dim])"
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
index 6be09f98dff6627d9bdcc8056eed14e2a621be4b..b173f856c641b4d7dca96adda113f904c97a25a7 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb
@@ -1,31 +1,11 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "name": "text_generation.ipynb",
-      "version": "0.3.2",
-      "views": {},
-      "default_view": {},
-      "provenance": [],
-      "private_outputs": true,
-      "collapsed_sections": [],
-      "toc_visible": true
-    },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "language": "python",
-      "name": "python3"
-    },
-    "accelerator": "GPU"
-  },
   "cells": [
     {
+      "cell_type": "markdown",
       "metadata": {
-        "id": "hcD2nPQvPOFM",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "hcD2nPQvPOFM"
       },
-      "cell_type": "markdown",
       "source": [
         "##### Copyright 2018 The TensorFlow Authors.\n",
         "\n",
@@ -33,19 +13,19 @@
         "\n",
         "# Text Generation using a RNN\n",
         "\n",
-        "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
-        "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb\">\n",
-        "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
-        "</td><td>\n",
-        "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on Github</a></td></table>"
+        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb\"\u003e\n",
+        "    \u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e  \n",
+        "\u003c/td\u003e\u003ctd\u003e\n",
+        "\u003ca target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/text_generation.ipynb\"\u003e\u003cimg width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on Github\u003c/a\u003e\u003c/td\u003e\u003c/table\u003e"
       ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
-        "id": "BwpJ5IffzRG6",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "BwpJ5IffzRG6"
       },
-      "cell_type": "markdown",
       "source": [
         "This notebook demonstrates how to generate text using an RNN using [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager). If you like, you can write a similar [model](https://github.com/fchollet/deep-learning-with-python-notebooks/blob/master/8.1-text-generation-with-lstm.ipynb) using less code. Here, we show a lower-level impementation that's useful to understand as prework before diving in to deeper examples in a similar, like [Neural Machine Translation with Attention](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb).\n",
         "\n",
@@ -102,58 +82,60 @@
       ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
-        "id": "R3p22DBDsaCA",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "R3p22DBDsaCA"
       },
-      "cell_type": "markdown",
       "source": [
         "## Install unidecode library\n",
         "A helpful library to convert unicode to ASCII."
       ]
     },
     {
+      "cell_type": "code",
+      "execution_count": 0,
       "metadata": {
-        "id": "wZ6LOM12wKGH",
-        "colab_type": "code",
         "colab": {
           "autoexec": {
             "startup": false,
             "wait_interval": 0
           }
-        }
+        },
+        "colab_type": "code",
+        "id": "wZ6LOM12wKGH"
       },
-      "cell_type": "code",
+      "outputs": [],
       "source": [
         "!pip install unidecode"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
-        "id": "WGyKZj3bzf9p",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "WGyKZj3bzf9p"
       },
-      "cell_type": "markdown",
       "source": [
         "## Import tensorflow and enable eager execution."
       ]
     },
     {
+      "cell_type": "code",
+      "execution_count": 0,
       "metadata": {
-        "id": "yG_n40gFzf9s",
-        "colab_type": "code",
         "colab": {
           "autoexec": {
             "startup": false,
             "wait_interval": 0
           }
-        }
+        },
+        "colab_type": "code",
+        "id": "yG_n40gFzf9s"
       },
-      "cell_type": "code",
+      "outputs": [],
       "source": [
-        "# Import TensorFlow >= 1.9 and enable eager execution\n",
+        "# Import TensorFlow \u003e= 1.9 and enable eager execution\n",
         "import tensorflow as tf\n",
         "\n",
         "# Note: Once you enable eager execution, it cannot be disabled. \n",
@@ -164,16 +146,14 @@
         "import random\n",
         "import unidecode\n",
         "import time"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
-        "id": "EHDoRoc5PKWz",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "EHDoRoc5PKWz"
       },
-      "cell_type": "markdown",
       "source": [
         "## Download the dataset\n",
         "\n",
@@ -182,76 +162,78 @@
       ]
     },
     {
+      "cell_type": "code",
+      "execution_count": 0,
       "metadata": {
-        "id": "pD_55cOxLkAb",
-        "colab_type": "code",
         "colab": {
           "autoexec": {
             "startup": false,
             "wait_interval": 0
           }
-        }
+        },
+        "colab_type": "code",
+        "id": "pD_55cOxLkAb"
       },
-      "cell_type": "code",
+      "outputs": [],
       "source": [
-        "path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/yashkatariya/shakespeare.txt')"
-      ],
-      "execution_count": 0,
-      "outputs": []
+        "path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')"
+      ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
-        "id": "UHjdCjDuSvX_",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "UHjdCjDuSvX_"
       },
-      "cell_type": "markdown",
       "source": [
         "## Read the dataset\n",
         "\n"
       ]
     },
     {
+      "cell_type": "code",
+      "execution_count": 0,
       "metadata": {
-        "id": "-E5JvY3wzf94",
-        "colab_type": "code",
         "colab": {
           "autoexec": {
             "startup": false,
             "wait_interval": 0
           }
-        }
+        },
+        "colab_type": "code",
+        "id": "-E5JvY3wzf94"
       },
-      "cell_type": "code",
+      "outputs": [],
       "source": [
         "text = unidecode.unidecode(open(path_to_file).read())\n",
         "# length of text is the number of characters in it\n",
         "print (len(text))"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
-        "id": "Il9ww98izf-D",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "Il9ww98izf-D"
       },
-      "cell_type": "markdown",
       "source": [
         "Creating dictionaries to map from characters to their indices and vice-versa, which will be used to vectorize the inputs"
       ]
     },
     {
+      "cell_type": "code",
+      "execution_count": 0,
       "metadata": {
-        "id": "IalZLbvOzf-F",
-        "colab_type": "code",
         "colab": {
           "autoexec": {
             "startup": false,
             "wait_interval": 0
           }
-        }
+        },
+        "colab_type": "code",
+        "id": "IalZLbvOzf-F"
       },
-      "cell_type": "code",
+      "outputs": [],
       "source": [
         "# unique contains all the unique characters in the file\n",
         "unique = sorted(set(text))\n",
@@ -259,22 +241,22 @@
         "# creating a mapping from unique characters to indices\n",
         "char2idx = {u:i for i, u in enumerate(unique)}\n",
         "idx2char = {i:u for i, u in enumerate(unique)}"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
+      "cell_type": "code",
+      "execution_count": 0,
       "metadata": {
-        "id": "1v_qUYfAzf-I",
-        "colab_type": "code",
         "colab": {
           "autoexec": {
             "startup": false,
             "wait_interval": 0
           }
-        }
+        },
+        "colab_type": "code",
+        "id": "1v_qUYfAzf-I"
       },
-      "cell_type": "code",
+      "outputs": [],
       "source": [
         "# setting the maximum length sentence we want for a single input in characters\n",
         "max_length = 100\n",
@@ -293,16 +275,14 @@
         "\n",
         "# buffer size to shuffle our dataset\n",
         "BUFFER_SIZE = 10000"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
-        "id": "LFjSVAlWzf-N",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "LFjSVAlWzf-N"
       },
-      "cell_type": "markdown",
       "source": [
         "## Creating the input and output tensors\n",
         "\n",
@@ -319,17 +299,19 @@
       ]
     },
     {
+      "cell_type": "code",
+      "execution_count": 0,
       "metadata": {
-        "id": "0UHJDA39zf-O",
-        "colab_type": "code",
         "colab": {
           "autoexec": {
             "startup": false,
             "wait_interval": 0
           }
-        }
+        },
+        "colab_type": "code",
+        "id": "0UHJDA39zf-O"
       },
-      "cell_type": "code",
+      "outputs": [],
       "source": [
         "input_text = []\n",
         "target_text = []\n",
@@ -343,45 +325,43 @@
         "    \n",
         "print (np.array(input_text).shape)\n",
         "print (np.array(target_text).shape)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
-        "id": "MJdfPmdqzf-R",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "MJdfPmdqzf-R"
       },
-      "cell_type": "markdown",
       "source": [
         "## Creating batches and shuffling them using tf.data"
       ]
     },
     {
+      "cell_type": "code",
+      "execution_count": 0,
       "metadata": {
-        "id": "p2pGotuNzf-S",
-        "colab_type": "code",
         "colab": {
           "autoexec": {
             "startup": false,
             "wait_interval": 0
           }
-        }
+        },
+        "colab_type": "code",
+        "id": "p2pGotuNzf-S"
       },
-      "cell_type": "code",
+      "outputs": [],
       "source": [
         "dataset = tf.data.Dataset.from_tensor_slices((input_text, target_text)).shuffle(BUFFER_SIZE)\n",
         "dataset = dataset.apply(tf.contrib.data.batch_and_drop_remainder(BATCH_SIZE))"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
-        "id": "m8gPwEjRzf-Z",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "m8gPwEjRzf-Z"
       },
-      "cell_type": "markdown",
       "source": [
         "## Creating the model\n",
         "\n",
@@ -393,17 +373,19 @@
       ]
     },
     {
+      "cell_type": "code",
+      "execution_count": 0,
       "metadata": {
-        "id": "P3KTiiInzf-a",
-        "colab_type": "code",
         "colab": {
           "autoexec": {
             "startup": false,
             "wait_interval": 0
           }
-        }
+        },
+        "colab_type": "code",
+        "id": "P3KTiiInzf-a"
       },
-      "cell_type": "code",
+      "outputs": [],
       "source": [
         "class Model(tf.keras.Model):\n",
         "  def __init__(self, vocab_size, embedding_dim, units, batch_size):\n",
@@ -447,66 +429,64 @@
         "    x = self.fc(output)\n",
         "\n",
         "    return x, states"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
-        "id": "trpqTWyvk0nr",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "trpqTWyvk0nr"
       },
-      "cell_type": "markdown",
       "source": [
         "## Call the model and set the optimizer and the loss function"
       ]
     },
     {
+      "cell_type": "code",
+      "execution_count": 0,
       "metadata": {
-        "id": "7t2XrzEOzf-e",
-        "colab_type": "code",
         "colab": {
           "autoexec": {
             "startup": false,
             "wait_interval": 0
           }
-        }
+        },
+        "colab_type": "code",
+        "id": "7t2XrzEOzf-e"
       },
-      "cell_type": "code",
+      "outputs": [],
       "source": [
         "model = Model(vocab_size, embedding_dim, units, BATCH_SIZE)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
+      "cell_type": "code",
+      "execution_count": 0,
       "metadata": {
-        "id": "dkjWIATszf-h",
-        "colab_type": "code",
         "colab": {
           "autoexec": {
             "startup": false,
             "wait_interval": 0
           }
-        }
+        },
+        "colab_type": "code",
+        "id": "dkjWIATszf-h"
       },
-      "cell_type": "code",
+      "outputs": [],
       "source": [
         "optimizer = tf.train.AdamOptimizer()\n",
         "\n",
         "# using sparse_softmax_cross_entropy so that we don't have to create one-hot vectors\n",
         "def loss_function(real, preds):\n",
         "    return tf.losses.sparse_softmax_cross_entropy(labels=real, logits=preds)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
-        "id": "lPrP0XMUzf-p",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "lPrP0XMUzf-p"
       },
-      "cell_type": "markdown",
       "source": [
         "## Train the model\n",
         "\n",
@@ -531,17 +511,19 @@
       ]
     },
     {
+      "cell_type": "code",
+      "execution_count": 0,
       "metadata": {
-        "id": "d4tSNwymzf-q",
-        "colab_type": "code",
         "colab": {
           "autoexec": {
             "startup": false,
             "wait_interval": 0
           }
-        }
+        },
+        "colab_type": "code",
+        "id": "d4tSNwymzf-q"
       },
-      "cell_type": "code",
+      "outputs": [],
       "source": [
         "# Training step\n",
         "\n",
@@ -574,16 +556,14 @@
         "    \n",
         "    print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))\n",
         "    print('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
-        "id": "DjGz1tDkzf-u",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "DjGz1tDkzf-u"
       },
-      "cell_type": "markdown",
       "source": [
         "## Predicting using our trained model\n",
         "\n",
@@ -601,17 +581,19 @@
       ]
     },
     {
+      "cell_type": "code",
+      "execution_count": 0,
       "metadata": {
-        "id": "WvuwZBX5Ogfd",
-        "colab_type": "code",
         "colab": {
           "autoexec": {
             "startup": false,
             "wait_interval": 0
           }
-        }
+        },
+        "colab_type": "code",
+        "id": "WvuwZBX5Ogfd"
       },
-      "cell_type": "code",
+      "outputs": [],
       "source": [
         "# Evaluation step(generating text using the model learned)\n",
         "\n",
@@ -648,16 +630,14 @@
         "    text_generated += idx2char[predicted_id]\n",
         "\n",
         "print (start_string + text_generated)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     },
     {
+      "cell_type": "markdown",
       "metadata": {
-        "id": "AM2Uma_-yVIq",
-        "colab_type": "text"
+        "colab_type": "text",
+        "id": "AM2Uma_-yVIq"
       },
-      "cell_type": "markdown",
       "source": [
         "## Next steps\n",
         "\n",
@@ -668,22 +648,42 @@
       ]
     },
     {
+      "cell_type": "code",
+      "execution_count": 0,
       "metadata": {
-        "id": "gtEd86sX5cB2",
-        "colab_type": "code",
         "colab": {
           "autoexec": {
             "startup": false,
             "wait_interval": 0
           }
-        }
+        },
+        "colab_type": "code",
+        "id": "gtEd86sX5cB2"
       },
-      "cell_type": "code",
+      "outputs": [],
       "source": [
         ""
-      ],
-      "execution_count": 0,
-      "outputs": []
+      ]
     }
-  ]
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "collapsed_sections": [],
+      "default_view": {},
+      "name": "text_generation.ipynb",
+      "private_outputs": true,
+      "provenance": [],
+      "toc_visible": true,
+      "version": "0.3.2",
+      "views": {}
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
 }
diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/README.md b/tensorflow/contrib/eager/python/examples/l2hmc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..d6a2ff7558c76c714df1674c4c8c627fa433f197
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/l2hmc/README.md
@@ -0,0 +1,54 @@
+# L2HMC with TensorFlow eager execution
+
+This folder contains an implementation of [L2HMC](https://arxiv.org/pdf/1711.09268.pdf)  adapted from the released implementation by the authors. The presented implementation runs in both eager and graph mode.
+With eager execution enabled, longer sample chains can be handled compared to graph mode, since no graph is explicitly stored. Moreover, with eager execution enabled, there is no need to use a `tf.while_loop`.
+
+## What is L2HMC?
+L2HMC is an algorithm that learns a non-volume preserving transformation
+for an HMC-like sampling algorithm. More specifically, the non-volume preserving
+transformation is learned with neural nets instantiated within Normalizing Flows
+(more precisely, real-NVPs).
+
+##  Content
+
+- `l2hmc.py`: Dynamics definitions and example energy functions,
+including the 2D strongly correlated Gaussian, the rough well energy function,
+and a Gaussian mixture model.
+- `l2hmc_test.py`: Unit tests and benchmarks for training a sampler on the energy functions in both eager and graph mode.
+- `neural_nets.py`: The neural net for learning the kernel on the 2D strongly correlated example.
+- `main.py`: Run to train a samplers on 2D energy landscapes.
+
+## To run
+- Make sure you have installed TensorFlow 1.9+ or the latest `tf-nightly` or `tf-nightly-gpu` pip package.
+- Execute the command
+
+```bash
+python main.py --train_dir ${PWD}/dump --use_defun
+```
+
+Specifying the optional argument `train_dir` will store event files for
+tensorboard and a plot of sampled chain from the trained sampler.
+
+Specifying the optional argument `use_defun` will let the program use compiled
+graphs when running specific sections and improve the overall speed.
+
+## Boosting Performance with `defun`
+Currently, some models may experience increased overhead with eager execution enabled.
+To improve performance, we could wrap certain functions with the decorator `@tfe.defun`.
+For example, we could wrap the function that does the sampling step:
+
+```python
+@tfe.defun
+def apply_transition(old_sample):
+  new_sample = ...
+  return new_sample
+```
+
+We could also explicitly wrap the desired function with `tfe.defun`:
+
+```python
+apply_transition = tfe.defun(apply_transition)
+```
+
+## Reference
+Generalizing Hamiltonian Monte Carlo with Neural Networks. Levy, Daniel, Hoffman, Matthew D, and Sohl-Dickstein, Jascha. International Conference on Learning Representations (ICLR), 2018.
diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc.py b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc.py
index 275aee513087c2d07147d1cf589b2b9f8d542487..14b8324e488a864cb23ff2507fab1c53c0583bc0 100644
--- a/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc.py
+++ b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc.py
@@ -32,20 +32,28 @@ from tensorflow.contrib.eager.python.examples.l2hmc import neural_nets
 
 
 class Dynamics(tf.keras.Model):
-  """Dynamics engine of naive L2HMC sampler.
-
-  Args:
-    x_dim: dimensionality of observed data
-    loglikelihood_fn: log-likelihood function of conditional probability
-    n_steps: number of leapfrog steps within each transition
-    eps: initial value learnable scale of step size
-  """
-
-  def __init__(self, x_dim, loglikelihood_fn, n_steps=25, eps=.1):
+  """Dynamics engine of naive L2HMC sampler."""
+
+  def __init__(self,
+               x_dim,
+               minus_loglikelihood_fn,
+               n_steps=25,
+               eps=.1,
+               np_seed=1):
+    """Initialization.
+
+    Args:
+      x_dim: dimensionality of observed data
+      minus_loglikelihood_fn: log-likelihood function of conditional probability
+      n_steps: number of leapfrog steps within each transition
+      eps: initial value learnable scale of step size
+      np_seed: Random seed for numpy; used to control sampled masks.
+    """
     super(Dynamics, self).__init__()
 
+    npr.seed(np_seed)
     self.x_dim = x_dim
-    self.potential = loglikelihood_fn
+    self.potential = minus_loglikelihood_fn
     self.n_steps = n_steps
 
     self._construct_time()
@@ -68,8 +76,8 @@ class Dynamics(tf.keras.Model):
         position, forward=False)
 
     # Decide direction uniformly
-    forward_mask = tf.cast(
-        tf.random_uniform(shape=[tf.shape(position)[0]]) > .5, tf.float32)
+    batch_size = tf.shape(position)[0]
+    forward_mask = tf.cast(tf.random_uniform((batch_size,)) > .5, tf.float32)
     backward_mask = 1. - forward_mask
 
     # Obtain proposed states
@@ -108,7 +116,6 @@ class Dynamics(tf.keras.Model):
       position_post, momentum_post, logdet = lf_fn(position_post, momentum_post,
                                                    i)
       sumlogdet += logdet
-
     accept_prob = self._compute_accept_prob(position, momentum, position_post,
                                             momentum_post, sumlogdet)
 
@@ -125,17 +132,17 @@ class Dynamics(tf.keras.Model):
     sumlogdet += logdet
 
     position, logdet = self._update_position_forward(position, momentum, t,
-                                                     mask)
+                                                     mask, mask_inv)
     sumlogdet += logdet
 
     position, logdet = self._update_position_forward(position, momentum, t,
-                                                     mask_inv)
+                                                     mask_inv, mask)
     sumlogdet += logdet
 
     momentum, logdet = self._update_momentum_forward(position, momentum, t)
     sumlogdet += logdet
 
-    return position, momentum, tf.reduce_sum(sumlogdet, axis=1)
+    return position, momentum, sumlogdet
 
   def _backward_lf(self, position, momentum, i):
     """One backward augmented leapfrog step. See Appendix A in paper."""
@@ -149,17 +156,17 @@ class Dynamics(tf.keras.Model):
     sumlogdet += logdet
 
     position, logdet = self._update_position_backward(position, momentum, t,
-                                                      mask)
+                                                      mask_inv, mask)
     sumlogdet += logdet
 
     position, logdet = self._update_position_backward(position, momentum, t,
-                                                      mask_inv)
+                                                      mask, mask_inv)
     sumlogdet += logdet
 
     momentum, logdet = self._update_momentum_backward(position, momentum, t)
     sumlogdet += logdet
 
-    return position, momentum, tf.reduce_sum(sumlogdet, axis=1)
+    return position, momentum, sumlogdet
 
   def _update_momentum_forward(self, position, momentum, t):
     """Update v in the forward leapfrog step."""
@@ -172,12 +179,11 @@ class Dynamics(tf.keras.Model):
         momentum * tf.exp(scale) -
         .5 * self.eps * (tf.exp(transformed) * grad - translation))
 
-    return momentum, scale
+    return momentum, tf.reduce_sum(scale, axis=1)
 
-  def _update_position_forward(self, position, momentum, t, mask):
+  def _update_position_forward(self, position, momentum, t, mask, mask_inv):
     """Update x in the forward leapfrog step."""
 
-    mask_inv = 1. - mask
     scale, translation, transformed = self.position_fn(
         [momentum, mask * position, t])
     scale *= self.eps
@@ -186,8 +192,7 @@ class Dynamics(tf.keras.Model):
         mask * position +
         mask_inv * (position * tf.exp(scale) + self.eps *
                     (tf.exp(transformed) * momentum + translation)))
-
-    return position, mask_inv * scale
+    return position, tf.reduce_sum(mask_inv * scale, axis=1)
 
   def _update_momentum_backward(self, position, momentum, t):
     """Update v in the backward leapfrog step. Inverting the forward update."""
@@ -200,21 +205,20 @@ class Dynamics(tf.keras.Model):
         tf.exp(scale) * (momentum + .5 * self.eps *
                          (tf.exp(transformed) * grad - translation)))
 
-    return momentum, scale
+    return momentum, tf.reduce_sum(scale, axis=1)
 
-  def _update_position_backward(self, position, momentum, t, mask):
+  def _update_position_backward(self, position, momentum, t, mask, mask_inv):
     """Update x in the backward leapfrog step. Inverting the forward update."""
 
-    mask_inv = 1. - mask
     scale, translation, transformed = self.position_fn(
-        [momentum, mask_inv * position, t])
+        [momentum, mask * position, t])
     scale *= -self.eps
     transformed *= self.eps
     position = (
-        mask_inv * position + mask * tf.exp(scale) *
-        (position - self.eps * tf.exp(transformed) * momentum + translation))
+        mask * position + mask_inv * tf.exp(scale) *
+        (position - self.eps * (tf.exp(transformed) * momentum + translation)))
 
-    return position, mask * scale
+    return position, tf.reduce_sum(mask_inv * scale, axis=1)
 
   def _compute_accept_prob(self, position, momentum, position_post,
                            momentum_post, sumlogdet):
@@ -222,8 +226,10 @@ class Dynamics(tf.keras.Model):
 
     old_hamil = self.hamiltonian(position, momentum)
     new_hamil = self.hamiltonian(position_post, momentum_post)
+    prob = tf.exp(tf.minimum(old_hamil - new_hamil + sumlogdet, 0.))
 
-    return tf.exp(tf.minimum(old_hamil - new_hamil + sumlogdet, 0.))
+    # Ensure numerical stability as well as correct gradients
+    return tf.where(tf.is_finite(prob), prob, tf.zeros_like(prob))
 
   def _construct_time(self):
     """Convert leapfrog step index into sinusoidal time."""
@@ -248,6 +254,8 @@ class Dynamics(tf.keras.Model):
 
     self.masks = []
     for _ in range(self.n_steps):
+      # Need to use npr here because tf would generated different random
+      # values across different `sess.run`
       idx = npr.permutation(np.arange(self.x_dim))[:self.x_dim // 2]
       mask = np.zeros((self.x_dim,))
       mask[idx] = 1.
@@ -273,19 +281,15 @@ class Dynamics(tf.keras.Model):
   def grad_potential(self, position, check_numerics=True):
     """Get gradient of potential function at current location."""
 
-    if not tf.executing_eagerly():
-      # TODO(lxuechen): Change this to tfe.gradients_function when it works
-      grad = tf.gradients(self.potential(position), position)[0]
-    else:
+    if tf.executing_eagerly():
       grad = tfe.gradients_function(self.potential)(position)[0]
-
-    if check_numerics:
-      return tf.check_numerics(grad, message="gradient of potential")
+    else:
+      grad = tf.gradients(self.potential(position), position)[0]
 
     return grad
 
 
-# Examples of unnormalized log density/probabilities
+# Examples of unnormalized log densities
 def get_scg_energy_fn():
   """Get energy function for 2d strongly correlated Gaussian."""
 
@@ -295,32 +299,53 @@ def get_scg_energy_fn():
   sigma_inv = tf.matrix_inverse(sigma)
 
   def energy(x):
-    """Unnormalized log density/energy of 2d strongly correlated Gaussian."""
+    """Unnormalized minus log density of 2d strongly correlated Gaussian."""
 
     xmmu = x - mu
     return .5 * tf.diag_part(
         tf.matmul(tf.matmul(xmmu, sigma_inv), tf.transpose(xmmu)))
 
-  return energy
+  return energy, mu, sigma
 
 
-def get_multivariate_gaussian_energy_fn(x_dim=2):
-  """Get energy function for 2d strongly correlated Gaussian."""
-
-  mu = tf.random_normal(shape=[x_dim])
-  # Lower triangularize and positive diagonal
-  l = tf.sigmoid(
-      tf.matrix_band_part(tf.random_normal(shape=[x_dim, x_dim]), -1, 0))
-  # Exploit Cholesky decomposition
-  sigma = tf.matmul(l, tf.transpose(l))
-  sigma *= 100.  # Small covariance causes extreme numerical instability
-  sigma_inv = tf.matrix_inverse(sigma)
+def get_rw_energy_fn():
+  """Get energy function for rough well distribution."""
+  # For small eta, the density underlying the rough-well energy is very close to
+  # a unit Gaussian; however, the gradient is greatly affected by the small
+  # cosine perturbations
+  eta = 1e-2
+  mu = tf.constant([0., 0.])
+  sigma = tf.constant([[1., 0.], [0., 1.]])
 
   def energy(x):
-    """Unnormalized log density/energy of 2d strongly correlated Gaussian."""
+    ip = tf.reduce_sum(x**2., axis=1)
+    return .5 * ip + eta * tf.reduce_sum(tf.cos(x / eta), axis=1)
 
-    xmmu = x - mu
-    return .5 * tf.diag_part(
-        tf.matmul(tf.matmul(xmmu, sigma_inv), tf.transpose(xmmu)))
+  return energy, mu, sigma
+
+
+# Loss function
+def compute_loss(dynamics, x, scale=.1, eps=1e-4):
+  """Compute loss defined in equation (8)."""
+
+  z = tf.random_normal(tf.shape(x))  # Auxiliary variable
+  x_, _, x_accept_prob, x_out = dynamics.apply_transition(x)
+  z_, _, z_accept_prob, _ = dynamics.apply_transition(z)
+
+  # Add eps for numerical stability; following released impl
+  x_loss = tf.reduce_sum((x - x_)**2, axis=1) * x_accept_prob + eps
+  z_loss = tf.reduce_sum((z - z_)**2, axis=1) * z_accept_prob + eps
+
+  loss = tf.reduce_mean(
+      (1. / x_loss + 1. / z_loss) * scale - (x_loss + z_loss) / scale, axis=0)
+
+  return loss, x_out, x_accept_prob
+
+
+def loss_and_grads(dynamics, x, loss_fn=compute_loss):
+  """Obtain loss value and gradients."""
+  with tf.GradientTape() as tape:
+    loss_val, out, accept_prob = loss_fn(dynamics, x)
+  grads = tape.gradient(loss_val, dynamics.trainable_variables)
 
-  return energy
+  return loss_val, grads, out, accept_prob
diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py
index e33b4cae4c73388dfd78542c9907953f137ad710..955747988536bd21d52df66a35af4aa31b3f7688 100644
--- a/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py
+++ b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py
@@ -37,63 +37,37 @@ def get_default_hparams():
       n_warmup_iters=3)
 
 
-# Relevant functions for benchmarking
-def compute_loss(dynamics, x, scale=.1, eps=1e-4):
-  """Compute loss defined in equation (8)."""
-
-  z = tf.random_normal(tf.shape(x))
-  x_, _, x_accept_prob, x_out = dynamics.apply_transition(x)
-  z_, _, z_accept_prob, _ = dynamics.apply_transition(z)
-
-  # Add eps for numerical stability; following released impl
-  x_loss = tf.reduce_sum((x - x_)**2, axis=1) * x_accept_prob + eps
-  z_loss = tf.reduce_sum((z - z_)**2, axis=1) * z_accept_prob + eps
-
-  loss = tf.reduce_mean(
-      (1. / x_loss + 1. / z_loss) * scale - (x_loss + z_loss) / scale, axis=0)
-
-  return loss, x_out
-
-
-def loss_and_grads(dynamics, x, loss_fn=compute_loss):
-  """Obtain loss value and gradients."""
-
-  with tf.GradientTape() as tape:
-    loss_val, x_out = loss_fn(dynamics, x)
-  grads = tape.gradient(loss_val, dynamics.variables)
-
-  return loss_val, grads, x_out
-
-
-def warmup(dynamics, optimizer, n_iters=1, n_samples=200, loss_fn=compute_loss):
+def warmup(dynamics,
+           optimizer,
+           n_iters=1,
+           n_samples=200,
+           loss_fn=l2hmc.compute_loss):
   """Warmup optimization to reduce overhead."""
 
   samples = tf.random_normal(
       shape=[n_samples, dynamics.x_dim], dtype=tf.float32)
 
   for _ in range(n_iters):
-    _, grads, samples = loss_and_grads(dynamics, samples, loss_fn=loss_fn)
+    _, grads, samples, _ = l2hmc.loss_and_grads(
+        dynamics, samples, loss_fn=loss_fn)
     optimizer.apply_gradients(zip(grads, dynamics.variables))
 
 
 def fit(dynamics,
         samples,
         optimizer,
-        loss_fn=compute_loss,
+        loss_fn=l2hmc.compute_loss,
         n_iters=5000,
         verbose=True,
-        logdir=None,
-        decay_lr=True):
+        logdir=None):
   """Fit L2HMC sampler with given log-likelihood function."""
 
   if logdir:
     summary_writer = tf.contrib.summary.create_file_writer(logdir)
 
   for i in range(n_iters):
-    loss, grads, samples = loss_and_grads(dynamics, samples, loss_fn=loss_fn)
-    # TODO(lxuechen): Proper learning rate decay
-    if decay_lr:
-      grads = [grad * .96**(i // 1000) for grad in grads]
+    loss, grads, samples, _ = l2hmc.loss_and_grads(
+        dynamics, samples, loss_fn=loss_fn)
     optimizer.apply_gradients(zip(grads, dynamics.variables))
     if verbose:
       print("Iteration %d: loss %.4f" % (i, loss))
@@ -112,9 +86,10 @@ class L2hmcTest(tf.test.TestCase):
 
     # Eager mode testing
     hparams = get_default_hparams()
+    energy_fn, _, _ = l2hmc.get_scg_energy_fn()
     dynamics = l2hmc.Dynamics(
         x_dim=hparams.x_dim,
-        loglikelihood_fn=l2hmc.get_scg_energy_fn(),
+        minus_loglikelihood_fn=energy_fn,
         n_steps=hparams.n_steps,
         eps=hparams.eps)
     samples = tf.random_normal(shape=[hparams.n_samples, hparams.x_dim])
@@ -127,9 +102,10 @@ class L2hmcTest(tf.test.TestCase):
 
     # Graph mode testing
     with tf.Graph().as_default():
+      energy_fn, _, _ = l2hmc.get_scg_energy_fn()
       dynamics = l2hmc.Dynamics(
           x_dim=hparams.x_dim,
-          loglikelihood_fn=l2hmc.get_scg_energy_fn(),
+          minus_loglikelihood_fn=energy_fn,
           n_steps=hparams.n_steps,
           eps=hparams.eps)
       x = tf.placeholder(tf.float32, shape=[None, hparams.x_dim])
@@ -150,32 +126,20 @@ class L2hmcTest(tf.test.TestCase):
 class L2hmcBenchmark(tf.test.Benchmark):
   """Eager and graph benchmarks for l2hmc."""
 
-  def _get_energy_fn(self):
-    """Get specific energy function according to FLAGS."""
-
-    if FLAGS.energy_fn == "scg":
-      energy_fn = l2hmc.get_scg_energy_fn()
-    elif FLAGS.energy_fn == "multivariate_gaussian":
-      energy_fn = l2hmc.get_multivariate_gaussian_energy_fn(x_dim=FLAGS.x_dim)
-    else:
-      raise ValueError("No such energy function %s" % FLAGS.energy_fn)
-
-    return energy_fn
-
   def benchmark_graph(self):
     """Benchmark Graph performance."""
 
     hparams = get_default_hparams()
     tf.reset_default_graph()
     with tf.Graph().as_default():
-      energy_fn = self._get_energy_fn()
+      energy_fn, _, _ = l2hmc.get_scg_energy_fn()
       dynamics = l2hmc.Dynamics(
           x_dim=hparams.x_dim,
-          loglikelihood_fn=energy_fn,
+          minus_loglikelihood_fn=energy_fn,
           n_steps=hparams.n_steps,
           eps=hparams.eps)
       x = tf.placeholder(tf.float32, shape=[None, hparams.x_dim])
-      loss, x_out = compute_loss(dynamics, x)
+      loss, x_out, _ = l2hmc.compute_loss(dynamics, x)
 
       global_step = tf.Variable(0., name="global_step", trainable=False)
       learning_rate = tf.train.exponential_decay(
@@ -183,7 +147,11 @@ class L2hmcBenchmark(tf.test.Benchmark):
       optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
       train_op = optimizer.minimize(loss, global_step=global_step)
 
-      with tf.Session() as sess:
+      # Single thread; fairer comparison against eager
+      session_conf = tf.ConfigProto(
+          intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
+
+      with tf.Session(config=session_conf) as sess:
         sess.run(tf.global_variables_initializer())
 
         # Warmup to reduce initialization effect when timing
@@ -218,14 +186,14 @@ class L2hmcBenchmark(tf.test.Benchmark):
     """Benchmark Eager performance."""
 
     hparams = get_default_hparams()
-    energy_fn = self._get_energy_fn()
+    energy_fn, _, _ = l2hmc.get_scg_energy_fn()
     dynamics = l2hmc.Dynamics(
         x_dim=hparams.x_dim,
-        loglikelihood_fn=energy_fn,
+        minus_loglikelihood_fn=energy_fn,
         n_steps=hparams.n_steps,
         eps=hparams.eps)
     optimizer = tf.train.AdamOptimizer(learning_rate=hparams.learning_rate)
-    loss_fn = tfe.defun(compute_loss) if defun else compute_loss
+    loss_fn = tfe.defun(l2hmc.compute_loss) if defun else l2hmc.compute_loss
 
     # Warmup to reduce initialization effect when timing
     warmup(dynamics, optimizer, n_iters=hparams.n_warmup_iters, loss_fn=loss_fn)
@@ -234,12 +202,7 @@ class L2hmcBenchmark(tf.test.Benchmark):
     samples = tf.random_normal(
         shape=[hparams.n_samples, hparams.x_dim], dtype=tf.float32)
     start_time = time.time()
-    fit(dynamics,
-        samples,
-        optimizer,
-        loss_fn=loss_fn,
-        n_iters=hparams.n_iters,
-        decay_lr=True)
+    fit(dynamics, samples, optimizer, loss_fn=loss_fn, n_iters=hparams.n_iters)
     wall_time = time.time() - start_time
     examples_per_sec = hparams.n_samples / wall_time
 
@@ -251,14 +214,8 @@ class L2hmcBenchmark(tf.test.Benchmark):
         wall_time=wall_time)
 
     del dynamics
-    del loss_fn
 
 
 if __name__ == "__main__":
-  tf.flags.DEFINE_string("energy_fn", "scg",
-                         ("The energy function/unnormalized log-probability. "
-                          "Either be `scg` or `multivariate_gaussian`"))
-  tf.flags.DEFINE_integer("x_dim", 2, "Dimensionality of observation space.")
-  FLAGS = tf.flags.FLAGS
   tf.enable_eager_execution()
   tf.test.main()
diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/main.py b/tensorflow/contrib/eager/python/examples/l2hmc/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..45e1f98429f48749d374c2aefd8874690c3830ad
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/l2hmc/main.py
@@ -0,0 +1,235 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""L2HMC on simple Gaussian mixture model with TensorFlow eager."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import sys
+
+from absl import flags
+import numpy as np
+import tensorflow as tf
+from tensorflow.contrib.eager.python.examples.l2hmc import l2hmc
+try:
+  import matplotlib.pyplot as plt  # pylint: disable=g-import-not-at-top
+  HAS_MATPLOTLIB = True
+except ImportError:
+  HAS_MATPLOTLIB = False
+tfe = tf.contrib.eager
+
+
+def main(_):
+  tf.enable_eager_execution()
+  global_step = tf.train.get_or_create_global_step()
+  global_step.assign(1)
+
+  energy_fn, mean, covar = {
+      "scg": l2hmc.get_scg_energy_fn(),
+      "rw": l2hmc.get_rw_energy_fn()
+  }[FLAGS.energy_fn]
+
+  x_dim = 2
+  train_iters = 5000
+  eval_iters = 2000
+  eps = 0.1
+  n_steps = 10  # Chain length
+  n_samples = 200
+  record_loss_every = 100
+
+  dynamics = l2hmc.Dynamics(
+      x_dim=x_dim, minus_loglikelihood_fn=energy_fn, n_steps=n_steps, eps=eps)
+  learning_rate = tf.train.exponential_decay(
+      1e-3, global_step, 1000, 0.96, staircase=True)
+  optimizer = tf.train.AdamOptimizer(learning_rate)
+  checkpointer = tf.train.Checkpoint(
+      optimizer=optimizer, dynamics=dynamics, global_step=global_step)
+
+  if FLAGS.train_dir:
+    summary_writer = tf.contrib.summary.create_file_writer(FLAGS.train_dir)
+    if FLAGS.restore:
+      latest_path = tf.train.latest_checkpoint(FLAGS.train_dir)
+      checkpointer.restore(latest_path)
+      print("Restored latest checkpoint at path:\"{}\" ".format(latest_path))
+      sys.stdout.flush()
+
+  if not FLAGS.restore:
+    # Training
+    if FLAGS.use_defun:
+      # Use `tfe.deun` to boost performance when there are lots of small ops
+      loss_fn = tfe.defun(l2hmc.compute_loss)
+    else:
+      loss_fn = l2hmc.compute_loss
+
+    samples = tf.random_normal(shape=[n_samples, x_dim])
+    for i in range(1, train_iters + 1):
+      loss, samples, accept_prob = train_one_iter(
+          dynamics,
+          samples,
+          optimizer,
+          loss_fn=loss_fn,
+          global_step=global_step)
+
+      if i % record_loss_every == 0:
+        print("Iteration {}, loss {:.4f}, x_accept_prob {:.4f}".format(
+            i, loss.numpy(),
+            accept_prob.numpy().mean()))
+        if FLAGS.train_dir:
+          with summary_writer.as_default():
+            with tf.contrib.summary.always_record_summaries():
+              tf.contrib.summary.scalar("Training loss", loss, step=global_step)
+    print("Training complete.")
+    sys.stdout.flush()
+
+    if FLAGS.train_dir:
+      saved_path = checkpointer.save(
+          file_prefix=os.path.join(FLAGS.train_dir, "ckpt"))
+      print("Saved checkpoint at path: \"{}\" ".format(saved_path))
+      sys.stdout.flush()
+
+  # Evaluation
+  if FLAGS.use_defun:
+    # Use tfe.deun to boost performance when there are lots of small ops
+    apply_transition = tfe.defun(dynamics.apply_transition)
+  else:
+    apply_transition = dynamics.apply_transition
+
+  samples = tf.random_normal(shape=[n_samples, x_dim])
+  samples_history = []
+  for i in range(eval_iters):
+    samples_history.append(samples.numpy())
+    _, _, _, samples = apply_transition(samples)
+  samples_history = np.array(samples_history)
+  print("Sampling complete.")
+  sys.stdout.flush()
+
+  # Mean and covariance of target distribution
+  mean = mean.numpy()
+  covar = covar.numpy()
+  ac_spectrum = compute_ac_spectrum(samples_history, mean, covar)
+  print("First 25 entries of the auto-correlation spectrum: {}".format(
+      ac_spectrum[:25]))
+  ess = compute_ess(ac_spectrum)
+  print("Effective sample size per Metropolis-Hastings step: {}".format(ess))
+  sys.stdout.flush()
+
+  if FLAGS.train_dir:
+    # Plot autocorrelation spectrum in tensorboard
+    plot_step = tfe.Variable(1, trainable=False, dtype=tf.int64)
+
+    for ac in ac_spectrum:
+      with summary_writer.as_default():
+        with tf.contrib.summary.always_record_summaries():
+          tf.contrib.summary.scalar("Autocorrelation", ac, step=plot_step)
+      plot_step.assign(plot_step + n_steps)
+
+    if HAS_MATPLOTLIB:
+      # Choose a single chain and plot the trajectory
+      single_chain = samples_history[:, 0, :]
+      xs = single_chain[:100, 0]
+      ys = single_chain[:100, 1]
+      plt.figure()
+      plt.plot(xs, ys, color="orange", marker="o", alpha=0.6)  # Trained chain
+      plt.savefig(os.path.join(FLAGS.train_dir, "single_chain.png"))
+
+
+def train_one_iter(dynamics,
+                   x,
+                   optimizer,
+                   loss_fn=l2hmc.compute_loss,
+                   global_step=None):
+  """Train the sampler for one iteration."""
+  loss, grads, out, accept_prob = l2hmc.loss_and_grads(
+      dynamics, x, loss_fn=loss_fn)
+  optimizer.apply_gradients(
+      zip(grads, dynamics.trainable_variables), global_step=global_step)
+
+  return loss, out, accept_prob
+
+
+def compute_ac_spectrum(samples_history, target_mean, target_covar):
+  """Compute autocorrelation spectrum.
+
+  Follows equation 15 from the L2HMC paper.
+
+  Args:
+    samples_history: Numpy array of shape [T, B, D], where T is the total
+        number of time steps, B is the batch size, and D is the dimensionality
+        of sample space.
+    target_mean: 1D Numpy array of the mean of target(true) distribution.
+    target_covar: 2D Numpy array representing a symmetric matrix for variance.
+  Returns:
+    Autocorrelation spectrum, Numpy array of shape [T-1].
+  """
+
+  # Using numpy here since eager is a bit slow due to the loop
+  time_steps = samples_history.shape[0]
+  trace = np.trace(target_covar)
+
+  rhos = []
+  for t in range(time_steps - 1):
+    rho_t = 0.
+    for tau in range(time_steps - t):
+      v_tau = samples_history[tau, :, :] - target_mean
+      v_tau_plus_t = samples_history[tau + t, :, :] - target_mean
+      # Take dot product over observation dims and take mean over batch dims
+      rho_t += np.mean(np.sum(v_tau * v_tau_plus_t, axis=1))
+
+    rho_t /= trace * (time_steps - t)
+    rhos.append(rho_t)
+
+  return np.array(rhos)
+
+
+def compute_ess(ac_spectrum):
+  """Compute the effective sample size based on autocorrelation spectrum.
+
+  This follows equation 16 from the L2HMC paper.
+
+  Args:
+    ac_spectrum: Autocorrelation spectrum
+  Returns:
+    The effective sample size
+  """
+  # Cutoff from the first value less than 0.05
+  cutoff = np.argmax(ac_spectrum[1:] < .05)
+  if cutoff == 0:
+    cutoff = len(ac_spectrum)
+  ess = 1. / (1. + 2. * np.sum(ac_spectrum[1:cutoff]))
+  return ess
+
+
+if __name__ == "__main__":
+  flags.DEFINE_string(
+      "train_dir",
+      default=None,
+      help="[Optional] Directory to store the training information")
+  flags.DEFINE_boolean(
+      "restore",
+      default=False,
+      help="[Optional] Restore the latest checkpoint from `train_dir` if True")
+  flags.DEFINE_boolean(
+      "use_defun",
+      default=False,
+      help="[Optional] Use `tfe.defun` to boost performance")
+  flags.DEFINE_string(
+      "energy_fn",
+      default="scg",
+      help="[Optional] The energy function used for experimentation"
+      "Other options include `rw`")
+  FLAGS = flags.FLAGS
+  tf.app.run(main)
diff --git a/tensorflow/contrib/eager/python/examples/revnet/BUILD b/tensorflow/contrib/eager/python/examples/revnet/BUILD
index 0c0e4c0eb9d1a97eaca0fafa5df2517ef644fc95..4f0d46b1bae3760a63b2abe871034bdedf258f07 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/BUILD
+++ b/tensorflow/contrib/eager/python/examples/revnet/BUILD
@@ -43,6 +43,27 @@ py_library(
     ],
 )
 
+py_library(
+    name = "resnet_preprocessing",
+    srcs = ["resnet_preprocessing.py"],
+    srcs_version = "PY2AND3",
+    tags = ["local"],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_library(
+    name = "imagenet_input",
+    srcs = ["imagenet_input.py"],
+    srcs_version = "PY2AND3",
+    tags = ["local"],
+    deps = [
+        ":resnet_preprocessing",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
 # Tests
 cuda_py_test(
     name = "ops_test",
@@ -113,3 +134,39 @@ py_binary(
         "//tensorflow:tensorflow_py",
     ],
 )
+
+py_binary(
+    name = "main_estimator",
+    srcs = ["main_estimator.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cifar_input",
+        ":main",
+        ":revnet",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_library(
+    name = "main_estimator_lib",
+    srcs = ["main_estimator.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cifar_input",
+        ":main",
+        ":revnet",
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_library(
+    name = "main_estimator_tpu_lib",
+    srcs = ["main_estimator_tpu.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cifar_input",
+        ":main",
+        ":revnet",
+        "//tensorflow:tensorflow_py",
+    ],
+)
diff --git a/tensorflow/contrib/eager/python/examples/revnet/blocks.py b/tensorflow/contrib/eager/python/examples/revnet/blocks.py
index 306096e9f8c4da0ed7f893ae75067cd24e7274b1..89712f2c4503e622af17c37ec43f472f11210254 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/blocks.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/blocks.py
@@ -24,6 +24,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+import operator
+
 import tensorflow as tf
 from tensorflow.contrib.eager.python.examples.revnet import ops
 
@@ -45,7 +48,7 @@ class RevBlock(tf.keras.Model):
                bottleneck=False,
                fused=True,
                dtype=tf.float32):
-    """Initialize RevBlock.
+    """Initialization.
 
     Args:
       n_res: number of residual blocks
@@ -88,49 +91,27 @@ class RevBlock(tf.keras.Model):
       h = block(h, training=training)
     return h
 
-  def backward_grads_and_vars(self, x, y, dy, training=True):
+  def backward_grads(self, x, y, dy, training=True):
     """Apply reversible block backward to outputs."""
 
     grads_all = []
-    vars_all = []
-
     for i in reversed(range(len(self.blocks))):
       block = self.blocks[i]
       if i == 0:
         # First block usually contains downsampling that can't be reversed
-        with tf.GradientTape() as tape:
-          x = tf.identity(x)
-          tape.watch(x)
-          y = block(x, training=training)
-
-        grads_combined = tape.gradient(
-            y, [x] + block.trainable_variables, output_gradients=dy)
-        dy = grads_combined[0]
-        grads_all += grads_combined[1:]
-        vars_all += block.trainable_variables
+        dy, grads = block.backward_grads_with_downsample(
+            x, y, dy, training=True)
       else:
-        y, dy, grads, vars_ = block.backward_grads_and_vars(
-            y, dy, training=training)
-        grads_all += grads
-        vars_all += vars_
+        y, dy, grads = block.backward_grads(y, dy, training=training)
+      grads_all = grads + grads_all
 
-    return dy, grads_all, vars_all
+    return dy, grads_all
 
 
 class _Residual(tf.keras.Model):
   """Single residual block contained in a _RevBlock. Each `_Residual` object has
   two _ResidualInner objects, corresponding to the `F` and `G` functions in the
   paper.
-
-  Args:
-    filters: output filter size
-    strides: length 2 list/tuple of integers for height and width strides
-    input_shape: length 3 list/tuple of integers
-    batch_norm_first: whether to apply activation and batch norm before conv
-    data_format: tensor data format, "NCHW"/"NHWC",
-    bottleneck: use bottleneck residual if True
-    fused: use fused batch normalization if True
-    dtype: float16, float32, or float64
   """
 
   def __init__(self,
@@ -142,6 +123,18 @@ class _Residual(tf.keras.Model):
                bottleneck=False,
                fused=True,
                dtype=tf.float32):
+    """Initialization.
+
+    Args:
+      filters: output filter size
+      strides: length 2 list/tuple of integers for height and width strides
+      input_shape: length 3 list/tuple of integers
+      batch_norm_first: whether to apply activation and batch norm before conv
+      data_format: tensor data format, "NCHW"/"NHWC",
+      bottleneck: use bottleneck residual if True
+      fused: use fused batch normalization if True
+      dtype: float16, float32, or float64
+    """
     super(_Residual, self).__init__()
 
     self.filters = filters
@@ -191,167 +184,324 @@ class _Residual(tf.keras.Model):
 
     return tf.concat([y1, y2], axis=self.axis)
 
-  def backward_grads_and_vars(self, y, dy, training=True):
+  def backward_grads(self, y, dy, training=True):
     """Manually compute backward gradients given input and output grads."""
     dy1, dy2 = tf.split(dy, num_or_size_splits=2, axis=self.axis)
+    y1, y2 = tf.split(y, num_or_size_splits=2, axis=self.axis)
 
-    with tf.GradientTape(persistent=True) as tape:
-      y = tf.identity(y)
-      tape.watch(y)
-      y1, y2 = tf.split(y, num_or_size_splits=2, axis=self.axis)
-      z1 = y1
-      gz1 = self.g(z1, training=training)
-      x2 = y2 - gz1
+    with tf.GradientTape() as gtape:
+      gtape.watch(y1)
+      gy1 = self.g(y1, training=training)
+    grads_combined = gtape.gradient(
+        gy1, [y1] + self.g.trainable_variables, output_gradients=dy2)
+    dg = grads_combined[1:]
+    dx1 = dy1 + grads_combined[0]
+    # This doesn't affect eager execution, but improves memory efficiency with
+    # graphs
+    with tf.control_dependencies(dg + [dx1]):
+      x2 = y2 - gy1
+
+    with tf.GradientTape() as ftape:
+      ftape.watch(x2)
       fx2 = self.f(x2, training=training)
-      x1 = z1 - fx2
+    grads_combined = ftape.gradient(
+        fx2, [x2] + self.f.trainable_variables, output_gradients=dx1)
+    df = grads_combined[1:]
+    dx2 = dy2 + grads_combined[0]
+    # Same behavior as above
+    with tf.control_dependencies(df + [dx2]):
+      x1 = y1 - fx2
 
-    grads_combined = tape.gradient(
-        gz1, [z1] + self.g.trainable_variables, output_gradients=dy2)
-    dz1 = dy1 + grads_combined[0]
+    x = tf.concat([x1, x2], axis=self.axis)
+    dx = tf.concat([dx1, dx2], axis=self.axis)
+    grads = df + dg
+
+    return x, dx, grads
+
+  def backward_grads_with_downsample(self, x, y, dy, training=True):
+    """Manually compute backward gradients given input and output grads."""
+    # Splitting this from `backward_grads` for better readability
+    x1, x2 = tf.split(x, num_or_size_splits=2, axis=self.axis)
+    y1, _ = tf.split(y, num_or_size_splits=2, axis=self.axis)
+    dy1, dy2 = tf.split(dy, num_or_size_splits=2, axis=self.axis)
+
+    with tf.GradientTape() as gtape:
+      gtape.watch(y1)
+      gy1 = self.g(y1, training=training)
+    grads_combined = gtape.gradient(
+        gy1, [y1] + self.g.trainable_variables, output_gradients=dy2)
     dg = grads_combined[1:]
-    dx1 = dz1
+    dz1 = dy1 + grads_combined[0]
 
-    grads_combined = tape.gradient(
+    # dx1 need one more step to backprop through downsample
+    with tf.GradientTape() as x1tape:
+      x1tape.watch(x1)
+      z1 = ops.downsample(x1, self.filters // 2, self.strides, axis=self.axis)
+    dx1 = x1tape.gradient(z1, x1, output_gradients=dz1)
+
+    with tf.GradientTape() as ftape:
+      ftape.watch(x2)
+      fx2 = self.f(x2, training=training)
+    grads_combined = ftape.gradient(
         fx2, [x2] + self.f.trainable_variables, output_gradients=dz1)
-    dx2 = dy2 + grads_combined[0]
-    df = grads_combined[1:]
+    dx2, df = grads_combined[0], grads_combined[1:]
 
-    del tape
+    # dx2 need one more step to backprop through downsample
+    with tf.GradientTape() as x2tape:
+      x2tape.watch(x2)
+      z2 = ops.downsample(x2, self.filters // 2, self.strides, axis=self.axis)
+    dx2 += x2tape.gradient(z2, x2, output_gradients=dy2)
 
-    grads = df + dg
-    vars_ = self.f.trainable_variables + self.g.trainable_variables
-
-    x = tf.concat([x1, x2], axis=self.axis)
     dx = tf.concat([dx1, dx2], axis=self.axis)
+    grads = df + dg
 
-    return x, dx, grads, vars_
+    return dx, grads
 
 
-def _BottleneckResidualInner(filters,
-                             strides,
-                             input_shape,
-                             batch_norm_first=True,
-                             data_format="channels_first",
-                             fused=True,
-                             dtype=tf.float32):
+# Ideally, the following should be wrapped in `tf.keras.Sequential`, however
+# there are subtle issues with its placeholder insertion policy and batch norm
+class _BottleneckResidualInner(tf.keras.Model):
   """Single bottleneck residual inner function contained in _Resdual.
 
   Corresponds to the `F`/`G` functions in the paper.
   Suitable for training on ImageNet dataset.
-
-  Args:
-    filters: output filter size
-    strides: length 2 list/tuple of integers for height and width strides
-    input_shape: length 3 list/tuple of integers
-    batch_norm_first: whether to apply activation and batch norm before conv
-    data_format: tensor data format, "NCHW"/"NHWC"
-    fused: use fused batch normalization if True
-    dtype: float16, float32, or float64
-
-  Returns:
-    A keras model
   """
 
-  axis = 1 if data_format == "channels_first" else 3
-  model = tf.keras.Sequential()
-  if batch_norm_first:
-    model.add(
-        tf.keras.layers.BatchNormalization(
-            axis=axis, input_shape=input_shape, fused=fused, dtype=dtype))
-    model.add(tf.keras.layers.Activation("relu"))
-  model.add(
-      tf.keras.layers.Conv2D(
-          filters=filters // 4,
-          kernel_size=1,
-          strides=strides,
-          input_shape=input_shape,
-          data_format=data_format,
-          use_bias=False,
-          padding="SAME",
-          dtype=dtype))
-
-  model.add(
-      tf.keras.layers.BatchNormalization(axis=axis, fused=fused, dtype=dtype))
-  model.add(tf.keras.layers.Activation("relu"))
-  model.add(
-      tf.keras.layers.Conv2D(
-          filters=filters // 4,
-          kernel_size=3,
-          strides=(1, 1),
-          data_format=data_format,
-          use_bias=False,
-          padding="SAME",
-          dtype=dtype))
-
-  model.add(
-      tf.keras.layers.BatchNormalization(axis=axis, fused=fused, dtype=dtype))
-  model.add(tf.keras.layers.Activation("relu"))
-  model.add(
-      tf.keras.layers.Conv2D(
-          filters=filters,
-          kernel_size=1,
-          strides=(1, 1),
-          data_format=data_format,
-          use_bias=False,
-          padding="SAME",
-          dtype=dtype))
+  def __init__(self,
+               filters,
+               strides,
+               input_shape,
+               batch_norm_first=True,
+               data_format="channels_first",
+               fused=True,
+               dtype=tf.float32):
+    """Initialization.
 
-  return model
+    Args:
+      filters: output filter size
+      strides: length 2 list/tuple of integers for height and width strides
+      input_shape: length 3 list/tuple of integers
+      batch_norm_first: whether to apply activation and batch norm before conv
+      data_format: tensor data format, "NCHW"/"NHWC"
+      fused: use fused batch normalization if True
+      dtype: float16, float32, or float64
+    """
+    super(_BottleneckResidualInner, self).__init__()
+    axis = 1 if data_format == "channels_first" else 3
+    if batch_norm_first:
+      self.batch_norm_0 = tf.keras.layers.BatchNormalization(
+          axis=axis, input_shape=input_shape, fused=fused, dtype=dtype)
+    self.conv2d_1 = tf.keras.layers.Conv2D(
+        filters=filters // 4,
+        kernel_size=1,
+        strides=strides,
+        input_shape=input_shape,
+        data_format=data_format,
+        use_bias=False,
+        padding="SAME",
+        dtype=dtype)
 
+    self.batch_norm_1 = tf.keras.layers.BatchNormalization(
+        axis=axis, fused=fused, dtype=dtype)
+    self.conv2d_2 = tf.keras.layers.Conv2D(
+        filters=filters // 4,
+        kernel_size=3,
+        strides=(1, 1),
+        data_format=data_format,
+        use_bias=False,
+        padding="SAME",
+        dtype=dtype)
+
+    self.batch_norm_2 = tf.keras.layers.BatchNormalization(
+        axis=axis, fused=fused, dtype=dtype)
+    self.conv2d_3 = tf.keras.layers.Conv2D(
+        filters=filters,
+        kernel_size=1,
+        strides=(1, 1),
+        data_format=data_format,
+        use_bias=False,
+        padding="SAME",
+        dtype=dtype)
+
+    self.batch_norm_first = batch_norm_first
+
+  def call(self, x, training=True):
+    net = x
+    if self.batch_norm_first:
+      net = self.batch_norm_0(net, training=training)
+      net = tf.nn.relu(net)
+    net = self.conv2d_1(net)
 
-def _ResidualInner(filters,
-                   strides,
-                   input_shape,
-                   batch_norm_first=True,
-                   data_format="channels_first",
-                   fused=True,
-                   dtype=tf.float32):
+    net = self.batch_norm_1(net, training=training)
+    net = tf.nn.relu(net)
+    net = self.conv2d_2(net)
+
+    net = self.batch_norm_2(net, training=training)
+    net = tf.nn.relu(net)
+    net = self.conv2d_3(net)
+
+    return net
+
+
+class _ResidualInner(tf.keras.Model):
   """Single residual inner function contained in _ResdualBlock.
 
   Corresponds to the `F`/`G` functions in the paper.
-
-  Args:
-    filters: output filter size
-    strides: length 2 list/tuple of integers for height and width strides
-    input_shape: length 3 list/tuple of integers
-    batch_norm_first: whether to apply activation and batch norm before conv
-    data_format: tensor data format, "NCHW"/"NHWC"
-    fused: use fused batch normalization if True
-    dtype: float16, float32, or float64
-
-  Returns:
-    A keras model
   """
 
-  axis = 1 if data_format == "channels_first" else 3
-  model = tf.keras.Sequential()
-  if batch_norm_first:
-    model.add(
-        tf.keras.layers.BatchNormalization(
-            axis=axis, input_shape=input_shape, fused=fused, dtype=dtype))
-    model.add(tf.keras.layers.Activation("relu"))
-  model.add(
-      tf.keras.layers.Conv2D(
-          filters=filters,
-          kernel_size=3,
-          strides=strides,
-          input_shape=input_shape,
-          data_format=data_format,
-          use_bias=False,
-          padding="SAME",
-          dtype=dtype))
-
-  model.add(
-      tf.keras.layers.BatchNormalization(axis=axis, fused=fused, dtype=dtype))
-  model.add(tf.keras.layers.Activation("relu"))
-  model.add(
-      tf.keras.layers.Conv2D(
-          filters=filters,
-          kernel_size=3,
-          strides=(1, 1),
-          data_format=data_format,
-          use_bias=False,
+  def __init__(self,
+               filters,
+               strides,
+               input_shape,
+               batch_norm_first=True,
+               data_format="channels_first",
+               fused=True,
+               dtype=tf.float32):
+    """Initialization.
+
+    Args:
+      filters: output filter size
+      strides: length 2 list/tuple of integers for height and width strides
+      input_shape: length 3 list/tuple of integers
+      batch_norm_first: whether to apply activation and batch norm before conv
+      data_format: tensor data format, "NCHW"/"NHWC"
+      fused: use fused batch normalization if True
+      dtype: float16, float32, or float64
+    """
+    super(_ResidualInner, self).__init__()
+    axis = 1 if data_format == "channels_first" else 3
+    if batch_norm_first:
+      self.batch_norm_0 = tf.keras.layers.BatchNormalization(
+          axis=axis, input_shape=input_shape, fused=fused, dtype=dtype)
+    self.conv2d_1 = tf.keras.layers.Conv2D(
+        filters=filters,
+        kernel_size=3,
+        strides=strides,
+        input_shape=input_shape,
+        data_format=data_format,
+        use_bias=False,
+        padding="SAME",
+        dtype=dtype)
+
+    self.batch_norm_1 = tf.keras.layers.BatchNormalization(
+        axis=axis, fused=fused, dtype=dtype)
+    self.conv2d_2 = tf.keras.layers.Conv2D(
+        filters=filters,
+        kernel_size=3,
+        strides=(1, 1),
+        data_format=data_format,
+        use_bias=False,
+        padding="SAME",
+        dtype=dtype)
+
+    self.batch_norm_first = batch_norm_first
+
+  def call(self, x, training=True):
+    net = x
+    if self.batch_norm_first:
+      net = self.batch_norm_0(net, training=training)
+      net = tf.nn.relu(net)
+    net = self.conv2d_1(net)
+
+    net = self.batch_norm_1(net, training=training)
+    net = tf.nn.relu(net)
+    net = self.conv2d_2(net)
+
+    return net
+
+
+class InitBlock(tf.keras.Model):
+  """Initial block of RevNet."""
+
+  def __init__(self, config):
+    """Initialization.
+
+    Args:
+      config: tf.contrib.training.HParams object; specifies hyperparameters
+    """
+    super(InitBlock, self).__init__()
+    self.config = config
+    self.axis = 1 if self.config.data_format == "channels_first" else 3
+    self.conv2d = tf.keras.layers.Conv2D(
+        filters=self.config.init_filters,
+        kernel_size=self.config.init_kernel,
+        strides=(self.config.init_stride, self.config.init_stride),
+        data_format=self.config.data_format,
+        use_bias=False,
+        padding="SAME",
+        input_shape=self.config.input_shape,
+        dtype=self.config.dtype)
+    self.batch_norm = tf.keras.layers.BatchNormalization(
+        axis=self.axis, fused=self.config.fused, dtype=self.config.dtype)
+    self.activation = tf.keras.layers.Activation("relu")
+
+    if self.config.init_max_pool:
+      self.max_pool = tf.keras.layers.MaxPooling2D(
+          pool_size=(3, 3),
+          strides=(2, 2),
           padding="SAME",
-          dtype=dtype))
+          data_format=self.config.data_format,
+          dtype=self.config.dtype)
+
+  def call(self, x, training=True):
+    net = x
+    net = self.conv2d(net)
+    net = self.batch_norm(net, training=training)
+    net = self.activation(net)
+
+    if self.config.init_max_pool:
+      net = self.max_pool(net)
+
+    return net
 
-  return model
+
+class FinalBlock(tf.keras.Model):
+  """Final block of RevNet."""
+
+  def __init__(self, config):
+    """Initialization.
+
+    Args:
+      config: tf.contrib.training.HParams object; specifies hyperparameters
+
+    Raises:
+      ValueError: Unsupported data format
+    """
+    super(FinalBlock, self).__init__()
+    self.config = config
+    self.axis = 1 if self.config.data_format == "channels_first" else 3
+
+    f = self.config.filters[-1]  # Number of filters
+    r = functools.reduce(operator.mul, self.config.strides, 1)  # Reduce ratio
+    r *= self.config.init_stride
+    if self.config.init_max_pool:
+      r *= 2
+
+    if self.config.data_format == "channels_first":
+      w, h = self.config.input_shape[1], self.config.input_shape[2]
+      input_shape = (f, w // r, h // r)
+    elif self.config.data_format == "channels_last":
+      w, h = self.config.input_shape[0], self.config.input_shape[1]
+      input_shape = (w // r, h // r, f)
+    else:
+      raise ValueError("Data format should be either `channels_first`"
+                       " or `channels_last`")
+    self.batch_norm = tf.keras.layers.BatchNormalization(
+        axis=self.axis,
+        input_shape=input_shape,
+        fused=self.config.fused,
+        dtype=self.config.dtype)
+    self.activation = tf.keras.layers.Activation("relu")
+    self.global_avg_pool = tf.keras.layers.GlobalAveragePooling2D(
+        data_format=self.config.data_format, dtype=self.config.dtype)
+    self.dense = tf.keras.layers.Dense(
+        self.config.n_classes, dtype=self.config.dtype)
+
+  def call(self, x, training=True):
+    net = x
+    net = self.batch_norm(net, training=training)
+    net = self.activation(net)
+    net = self.global_avg_pool(net)
+    net = self.dense(net)
+
+    return net
diff --git a/tensorflow/contrib/eager/python/examples/revnet/blocks_test.py b/tensorflow/contrib/eager/python/examples/revnet/blocks_test.py
index d74785c8fe1c170ee95172974141c1cfe18b9502..3c6ea63e48308f3d176e8e7d3781b5bf9b1461aa 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/blocks_test.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/blocks_test.py
@@ -179,7 +179,7 @@ class RevBlockTest(tf.test.TestCase):
       degree = compute_degree(g1, g2)
       self.assertLessEqual(degree, atol)
 
-  def test_backward_grads_and_vars_channels_first(self):
+  def test_backward_grads_channels_first(self):
     """Test `backward` function with `channels_first` data format."""
     if not tf.test.is_gpu_available():
       self.skipTest("GPU not available")
@@ -201,7 +201,8 @@ class RevBlockTest(tf.test.TestCase):
         tape.watch(x)
         y = block(x, training=True)
       # Compute grads from reconstruction
-      dx, dw, vars_ = block.backward_grads_and_vars(x, y, dy, training=True)
+      dx, dw = block.backward_grads(x, y, dy, training=True)
+      vars_ = block.trainable_variables
       # Compute true grads
       grads = tape.gradient(y, [x] + vars_, output_gradients=dy)
       dx_true, dw_true = grads[0], grads[1:]
@@ -224,7 +225,8 @@ class RevBlockTest(tf.test.TestCase):
         tape.watch(x)
         y = block(x, training=True)
       # Compute grads from reconstruction
-      dx, dw, vars_ = block.backward_grads_and_vars(x, y, dy, training=True)
+      dx, dw = block.backward_grads(x, y, dy, training=True)
+      vars_ = block.trainable_variables
       # Compute true grads
       grads = tape.gradient(y, [x] + vars_, output_gradients=dy)
       dx_true, dw_true = grads[0], grads[1:]
@@ -245,7 +247,7 @@ class _ResidualTest(tf.test.TestCase):
     _validate_block_call_channels_first(blocks._Residual, self)
     _validate_block_call_channels_last(blocks._Residual, self)
 
-  def test_backward_grads_and_vars_channels_first(self):
+  def test_backward_grads_channels_first(self):
     """Test `backward_grads` function with `channels_first` data format."""
     if not tf.test.is_gpu_available():
       self.skipTest("GPU not available")
@@ -269,9 +271,8 @@ class _ResidualTest(tf.test.TestCase):
         y = residual(x_true, training=True)
 
       # Gradients computed due to reversibility
-      x, dx, dw, vars_ = residual.backward_grads_and_vars(
-          y, dy=dy, training=True)
-
+      x, dx, dw = residual.backward_grads(y, dy=dy, training=True)
+      vars_ = residual.trainable_variables
       # True gradients computed by the tape
       grads = tape.gradient(y, [x_true] + vars_, output_gradients=dy)
       dx_true, dw_true = grads[0], grads[1:]
diff --git a/tensorflow/contrib/eager/python/examples/revnet/cifar_input.py b/tensorflow/contrib/eager/python/examples/revnet/cifar_input.py
index b6d4c35bfd21f9d651c4f059c019cf2e585da8b2..e9672f13e1587c96cea0fc7dd58b66ef256296cd 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/cifar_input.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/cifar_input.py
@@ -111,6 +111,6 @@ def get_ds_from_tfrecords(data_dir,
     }[split]
     dataset = dataset.shuffle(size)
 
-  dataset = dataset.batch(batch_size)
+  dataset = dataset.batch(batch_size, drop_remainder=True)
 
   return dataset
diff --git a/tensorflow/contrib/eager/python/examples/revnet/config.py b/tensorflow/contrib/eager/python/examples/revnet/config.py
index 3d93fa955a29718fdec52b04500c41f77351dd8d..821a4878c1cfe1aebe3697952059266def0f5817 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/config.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/config.py
@@ -27,17 +27,17 @@ from __future__ import division
 from __future__ import print_function
 
 import tensorflow as tf
-tfe = tf.contrib.eager
 
 
 def get_hparams_cifar_38():
   """RevNet-38 configurations for CIFAR-10/CIFAR-100."""
 
   config = tf.contrib.training.HParams()
+  config.add_hparam("num_train_images", 50000)
+  config.add_hparam("num_eval_images", 10000)
   config.add_hparam("init_filters", 32)
   config.add_hparam("init_kernel", 3)
   config.add_hparam("init_stride", 1)
-  config.add_hparam("n_classes", 10)
   config.add_hparam("n_rev_blocks", 3)
   config.add_hparam("n_res", [3, 3, 3])
   config.add_hparam("filters", [32, 64, 112])
@@ -46,7 +46,7 @@ def get_hparams_cifar_38():
   config.add_hparam("bottleneck", False)
   config.add_hparam("fused", True)
   config.add_hparam("init_max_pool", False)
-  if tfe.num_gpus() > 0:
+  if tf.test.is_gpu_available():
     config.add_hparam("input_shape", (3, 32, 32))
     config.add_hparam("data_format", "channels_first")
   else:
@@ -68,9 +68,21 @@ def get_hparams_cifar_38():
   config.add_hparam("div255", True)
   # This is imprecise, when training with validation set,
   # we only have 40k images in training data
-  config.add_hparam("iters_per_epoch", 50000 // config.batch_size)
+  config.add_hparam("iters_per_epoch",
+                    config.num_train_images // config.batch_size)
   config.add_hparam("epochs", config.max_train_iter // config.iters_per_epoch)
 
+  # Customized TPU hyperparameters due to differing batch size caused by
+  # TPU architecture specifics
+  # Suggested batch sizes to reduce overhead from excessive tensor padding
+  # https://cloud.google.com/tpu/docs/troubleshooting
+  config.add_hparam("tpu_batch_size", 1024)
+  config.add_hparam("tpu_eval_batch_size", 1024)
+  config.add_hparam("tpu_iters_per_epoch",
+                    config.num_train_images // config.tpu_batch_size)
+  config.add_hparam("tpu_epochs",
+                    config.max_train_iter // config.tpu_iters_per_epoch)
+
   return config
 
 
@@ -98,15 +110,18 @@ def get_hparams_imagenet_56():
   """RevNet-56 configurations for ImageNet."""
 
   config = tf.contrib.training.HParams()
+  config.add_hparam("n_classes", 1000)
+  config.add_hparam("dataset", "ImageNet")
+  config.add_hparam("num_train_images", 1281167)
+  config.add_hparam("num_eval_images", 50000)
   config.add_hparam("init_filters", 128)
   config.add_hparam("init_kernel", 7)
   config.add_hparam("init_stride", 2)
-  config.add_hparam("n_classes", 1000)
   config.add_hparam("n_rev_blocks", 4)
   config.add_hparam("n_res", [2, 2, 2, 2])
   config.add_hparam("filters", [128, 256, 512, 832])
   config.add_hparam("strides", [1, 2, 2, 2])
-  config.add_hparam("batch_size", 16)
+  config.add_hparam("batch_size", 256)
   config.add_hparam("bottleneck", True)
   config.add_hparam("fused", True)
   config.add_hparam("init_max_pool", True)
@@ -116,6 +131,9 @@ def get_hparams_imagenet_56():
   else:
     config.add_hparam("input_shape", (224, 224, 3))
     config.add_hparam("data_format", "channels_last")
+  # Due to bottleneck residual blocks
+  filters = [f * 4 for f in config.filters]
+  config.filters = filters
 
   # Training details
   config.add_hparam("weight_decay", 1e-4)
@@ -125,16 +143,31 @@ def get_hparams_imagenet_56():
   config.add_hparam("max_train_iter", 600000)
   config.add_hparam("seed", 1234)
   config.add_hparam("shuffle", True)
-  config.add_hparam("log_every", 50)
-  config.add_hparam("save_every", 50)
+  config.add_hparam("log_every", 500)
+  config.add_hparam("save_every", 500)
   config.add_hparam("dtype", tf.float32)
-  config.add_hparam("eval_batch_size", 1000)
+  config.add_hparam("eval_batch_size", 256)
   config.add_hparam("div255", True)
-  # TODO(lxuechen): Update this according to ImageNet data
-  config.add_hparam("iters_per_epoch", 50000 // config.batch_size)
+  config.add_hparam("iters_per_epoch",
+                    config.num_train_images // config.batch_size)
   config.add_hparam("epochs", config.max_train_iter // config.iters_per_epoch)
-  # Due to bottleneck residual blocks
-  filters = [f * 4 for f in config.filters]
-  config.filters = filters
+
+  # Customized TPU hyperparameters due to differing batch size caused by
+  # TPU architecture specifics
+  # Suggested batch sizes to reduce overhead from excessive tensor padding
+  # https://cloud.google.com/tpu/docs/troubleshooting
+  config.add_hparam("tpu_batch_size", 1024)
+  config.add_hparam("tpu_eval_batch_size", 1024)
+  config.add_hparam("tpu_iters_per_epoch",
+                    config.num_train_images // config.tpu_batch_size)
+  config.add_hparam("tpu_epochs",
+                    config.max_train_iter // config.tpu_iters_per_epoch)
+
+  return config
+
+
+def get_hparams_imagenet_104():
+  config = get_hparams_imagenet_56()
+  config.n_res = [2, 2, 11, 2]
 
   return config
diff --git a/tensorflow/contrib/eager/python/examples/revnet/imagenet_input.py b/tensorflow/contrib/eager/python/examples/revnet/imagenet_input.py
new file mode 100644
index 0000000000000000000000000000000000000000..e81351b1b14dbf6973e7430c369774339e2dcdd8
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/revnet/imagenet_input.py
@@ -0,0 +1,230 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Efficient ImageNet input pipeline using tf.data.Dataset."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+import tensorflow as tf
+
+from tensorflow.contrib.eager.python.examples.revnet import resnet_preprocessing
+
+
+def image_serving_input_fn():
+  """Serving input fn for raw images."""
+
+  def _preprocess_image(image_bytes):
+    """Preprocess a single raw image."""
+    image = resnet_preprocessing.preprocess_image(
+        image_bytes=image_bytes, is_training=False)
+    return image
+
+  image_bytes_list = tf.placeholder(
+      shape=[None],
+      dtype=tf.string,
+  )
+  images = tf.map_fn(
+      _preprocess_image, image_bytes_list, back_prop=False, dtype=tf.float32)
+  return tf.estimator.export.ServingInputReceiver(
+      images, {'image_bytes': image_bytes_list})
+
+
+class ImageNetInput(object):
+  """Generates ImageNet input_fn for training or evaluation.
+
+  The training data is assumed to be in TFRecord format with keys as specified
+  in the dataset_parser below, sharded across 1024 files, named sequentially:
+      train-00000-of-01024
+      train-00001-of-01024
+      ...
+      train-01023-of-01024
+
+  The validation data is in the same format but sharded in 128 files.
+
+  The format of the data required is created by the script at:
+      https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py
+
+  Args:
+    is_training: `bool` for whether the input is for training
+    data_dir: `str` for the directory of the training and validation data;
+        if 'null' (the literal string 'null', not None), then construct a null
+        pipeline, consisting of empty images.
+    use_bfloat16: If True, use bfloat16 precision; else use float32.
+    transpose_input: 'bool' for whether to use the double transpose trick
+    num_cores: `int` for the number of TPU cores
+  """
+
+  def __init__(self, is_training,
+               use_bfloat16,
+               data_dir,
+               num_cores=8,
+               num_parallel_calls=64,
+               image_size=224,
+               transpose_input=False,
+               cache=False):
+    self.image_preprocessing_fn = resnet_preprocessing.preprocess_image
+    self.is_training = is_training
+    self.use_bfloat16 = use_bfloat16
+    self.data_dir = data_dir
+    self.num_cores = num_cores
+    self.num_parallel_calls = num_parallel_calls
+    if self.data_dir == 'null' or self.data_dir == '':
+      self.data_dir = None
+    self.transpose_input = transpose_input
+    self.image_size = image_size
+    self.cache = cache
+
+  def set_shapes(self, batch_size, images, labels):
+    """Statically set the batch_size dimension."""
+    if self.transpose_input:
+      images.set_shape(images.get_shape().merge_with(
+          tf.TensorShape([None, None, None, batch_size])))
+      labels.set_shape(labels.get_shape().merge_with(
+          tf.TensorShape([batch_size])))
+    else:
+      images.set_shape(images.get_shape().merge_with(
+          tf.TensorShape([batch_size, None, None, None])))
+      labels.set_shape(labels.get_shape().merge_with(
+          tf.TensorShape([batch_size])))
+
+    return images, labels
+
+  def dataset_parser(self, value):
+    """Parse an ImageNet record from a serialized string Tensor."""
+    keys_to_features = {
+        'image/encoded': tf.FixedLenFeature((), tf.string, ''),
+        'image/format': tf.FixedLenFeature((), tf.string, 'jpeg'),
+        'image/class/label': tf.FixedLenFeature([], tf.int64, -1),
+        'image/class/text': tf.FixedLenFeature([], tf.string, ''),
+        'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32),
+        'image/object/class/label': tf.VarLenFeature(dtype=tf.int64),
+    }
+
+    parsed = tf.parse_single_example(value, keys_to_features)
+    image_bytes = tf.reshape(parsed['image/encoded'], shape=[])
+
+    image = self.image_preprocessing_fn(
+        image_bytes=image_bytes,
+        is_training=self.is_training,
+        image_size=self.image_size,
+        use_bfloat16=self.use_bfloat16)
+
+    # Subtract one so that labels are in [0, 1000).
+    label = tf.cast(
+        tf.reshape(parsed['image/class/label'], shape=[]), dtype=tf.int32) - 1
+
+    return image, label
+
+  def input_fn(self, params):
+    """Input function which provides a single batch for train or eval.
+
+    Args:
+      params: `dict` of parameters passed from the `TPUEstimator`.
+          `params['batch_size']` is always provided and should be used as the
+          effective batch size.
+
+    Returns:
+      A `tf.data.Dataset` object.
+    """
+    if self.data_dir is None:
+      tf.logging.info('Using fake input.')
+      return self.input_fn_null(params)
+
+    # Retrieves the batch size for the current shard. The # of shards is
+    # computed according to the input pipeline deployment. See
+    # tf.contrib.tpu.RunConfig for details.
+    batch_size = params['batch_size']
+
+    # Shuffle the filenames to ensure better randomization.
+    file_pattern = os.path.join(
+        self.data_dir, 'train-*' if self.is_training else 'validation-*')
+    dataset = tf.data.Dataset.list_files(file_pattern, shuffle=self.is_training)
+
+    if self.is_training and not self.cache:
+      dataset = dataset.repeat()
+
+    def fetch_dataset(filename):
+      buffer_size = 8 * 1024 * 1024  # 8 MiB per file
+      dataset = tf.data.TFRecordDataset(filename, buffer_size=buffer_size)
+      return dataset
+
+    # Read the data from disk in parallel
+    dataset = dataset.apply(
+        tf.contrib.data.parallel_interleave(
+            fetch_dataset, cycle_length=self.num_parallel_calls, sloppy=True))
+    if self.cache:
+      dataset = dataset.cache().apply(
+          tf.contrib.data.shuffle_and_repeat(1024 * 16))
+    else:
+      dataset = dataset.shuffle(1024)
+
+    # Use the fused map-and-batch operation.
+    #
+    # For XLA, we must used fixed shapes. Because we repeat the source training
+    # dataset indefinitely, we can use `drop_remainder=True` to get fixed-size
+    # batches without dropping any training examples.
+    #
+    # When evaluating, `drop_remainder=True` prevents accidentally evaluating
+    # the same image twice by dropping the final batch if it is less than a full
+    # batch size. As long as this validation is done with consistent batch size,
+    # exactly the same images will be used.
+    dataset = dataset.apply(
+        tf.contrib.data.map_and_batch(
+            self.dataset_parser, batch_size=batch_size,
+            num_parallel_batches=self.num_cores, drop_remainder=True))
+
+    # Transpose for performance on TPU
+    if self.transpose_input:
+      dataset = dataset.map(
+          lambda images, labels: (tf.transpose(images, [1, 2, 3, 0]), labels),
+          num_parallel_calls=self.num_cores)
+
+    # Assign static batch size dimension
+    dataset = dataset.map(functools.partial(self.set_shapes, batch_size))
+
+    # Prefetch overlaps in-feed with training
+    dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE)
+    return dataset
+
+  def input_fn_null(self, params):
+    """Input function which provides null (black) images."""
+    batch_size = params['batch_size']
+    dataset = tf.data.Dataset.range(1).repeat().map(self._get_null_input)
+    dataset = dataset.prefetch(batch_size)
+
+    dataset = dataset.apply(
+        tf.contrib.data.batch_and_drop_remainder(batch_size))
+    if self.transpose_input:
+      dataset = dataset.map(
+          lambda images, labels: (tf.transpose(images, [1, 2, 3, 0]), labels),
+          num_parallel_calls=8)
+
+    dataset = dataset.map(functools.partial(self.set_shapes, batch_size))
+
+    dataset = dataset.prefetch(32)     # Prefetch overlaps in-feed with training
+    tf.logging.info('Input dataset: %s', str(dataset))
+    return dataset
+
+  def _get_null_input(self, _):
+    null_image = tf.zeros([224, 224, 3], tf.bfloat16
+                          if self.use_bfloat16 else tf.float32)
+    return (null_image, tf.constant(0, tf.int32))
diff --git a/tensorflow/contrib/eager/python/examples/revnet/main.py b/tensorflow/contrib/eager/python/examples/revnet/main.py
index e2f43b03f90ef6db01db1f85943e10ce8c9b582a..b702e91f92220c2a9003a1b82411131332012a9e 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/main.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/main.py
@@ -29,10 +29,18 @@ from tensorflow.contrib.eager.python.examples.revnet import revnet
 tfe = tf.contrib.eager
 
 
+def apply_gradients(optimizer, grads, vars_, global_step=None):
+  """Functional style apply_grads for `tfe.defun`."""
+  optimizer.apply_gradients(zip(grads, vars_), global_step=global_step)
+
+
 def main(_):
   """Eager execution workflow with RevNet trained on CIFAR-10."""
-  config = get_config()
-  ds_train, ds_train_one_shot, ds_validation, ds_test = get_datasets(config)
+  tf.enable_eager_execution()
+
+  config = get_config(config_name=FLAGS.config, dataset=FLAGS.dataset)
+  ds_train, ds_train_one_shot, ds_validation, ds_test = get_datasets(
+      data_dir=FLAGS.data_dir, config=config)
   model = revnet.RevNet(config=config)
   global_step = tf.train.get_or_create_global_step()  # Ensure correct summary
   global_step.assign(1)
@@ -43,6 +51,14 @@ def main(_):
   checkpointer = tf.train.Checkpoint(
       optimizer=optimizer, model=model, optimizer_step=global_step)
 
+  if FLAGS.use_defun:
+    model.call = tfe.defun(model.call)
+    model.compute_gradients = tfe.defun(model.compute_gradients)
+    model.get_moving_stats = tfe.defun(model.get_moving_stats)
+    model.restore_moving_stats = tfe.defun(model.restore_moving_stats)
+    global apply_gradients  # pylint:disable=global-variable-undefined
+    apply_gradients = tfe.defun(apply_gradients)
+
   if FLAGS.train_dir:
     summary_writer = tf.contrib.summary.create_file_writer(FLAGS.train_dir)
     if FLAGS.restore:
@@ -52,46 +68,37 @@ def main(_):
             "with global_step: {}".format(latest_path, global_step.numpy()))
       sys.stdout.flush()
 
-  if FLAGS.manual_grad:
-    print("Using manual gradients.")
-  else:
-    print("Not using manual gradients.")
-  sys.stdout.flush()
-
   for x, y in ds_train:
     train_one_iter(model, x, y, optimizer, global_step=global_step)
 
     if global_step.numpy() % config.log_every == 0:
-      it_train = ds_train_one_shot.make_one_shot_iterator()
       it_test = ds_test.make_one_shot_iterator()
-      acc_train, loss_train = evaluate(model, it_train)
       acc_test, loss_test = evaluate(model, it_test)
 
       if FLAGS.validate:
+        it_train = ds_train_one_shot.make_one_shot_iterator()
         it_validation = ds_validation.make_one_shot_iterator()
+        acc_train, loss_train = evaluate(model, it_train)
         acc_validation, loss_validation = evaluate(model, it_validation)
         print("Iter {}, "
               "training set accuracy {:.4f}, loss {:.4f}; "
-              "validation set accuracy {:.4f}, loss {:4.f}"
+              "validation set accuracy {:.4f}, loss {:.4f}; "
               "test accuracy {:.4f}, loss {:.4f}".format(
                   global_step.numpy(), acc_train, loss_train, acc_validation,
                   loss_validation, acc_test, loss_test))
       else:
-        print("Iter {}, "
-              "training set accuracy {:.4f}, loss {:.4f}; "
-              "test accuracy {:.4f}, loss {:.4f}".format(
-                  global_step.numpy(), acc_train, loss_train, acc_test,
-                  loss_test))
+        print("Iter {}, test accuracy {:.4f}, loss {:.4f}".format(
+            global_step.numpy(), acc_test, loss_test))
       sys.stdout.flush()
 
       if FLAGS.train_dir:
         with summary_writer.as_default():
           with tf.contrib.summary.always_record_summaries():
-            tf.contrib.summary.scalar("Training accuracy", acc_train)
             tf.contrib.summary.scalar("Test accuracy", acc_test)
-            tf.contrib.summary.scalar("Training loss", loss_train)
             tf.contrib.summary.scalar("Test loss", loss_test)
             if FLAGS.validate:
+              tf.contrib.summary.scalar("Training accuracy", acc_train)
+              tf.contrib.summary.scalar("Training loss", loss_train)
               tf.contrib.summary.scalar("Validation accuracy", acc_validation)
               tf.contrib.summary.scalar("Validation loss", loss_validation)
 
@@ -103,34 +110,38 @@ def main(_):
       sys.stdout.flush()
 
 
-def get_config():
+def get_config(config_name="revnet-38", dataset="cifar-10"):
   """Return configuration."""
-  print("Config: {}".format(FLAGS.config))
+  print("Config: {}".format(config_name))
   sys.stdout.flush()
   config = {
       "revnet-38": config_.get_hparams_cifar_38(),
       "revnet-110": config_.get_hparams_cifar_110(),
       "revnet-164": config_.get_hparams_cifar_164(),
-  }[FLAGS.config]
+  }[config_name]
 
-  if FLAGS.dataset == "cifar-100":
-    config.n_classes = 100
+  if dataset == "cifar-10":
+    config.add_hparam("n_classes", 10)
+    config.add_hparam("dataset", "cifar-10")
+  else:
+    config.add_hparam("n_classes", 100)
+    config.add_hparam("dataset", "cifar-100")
 
   return config
 
 
-def get_datasets(config):
+def get_datasets(data_dir, config):
   """Return dataset."""
-  if FLAGS.data_dir is None:
+  if data_dir is None:
     raise ValueError("No supplied data directory")
-  if not os.path.exists(FLAGS.data_dir):
-    raise ValueError("Data directory {} does not exist".format(FLAGS.data_dir))
-  if FLAGS.dataset not in ["cifar-10", "cifar-100"]:
-    raise ValueError("Unknown dataset {}".format(FLAGS.dataset))
+  if not os.path.exists(data_dir):
+    raise ValueError("Data directory {} does not exist".format(data_dir))
+  if config.dataset not in ["cifar-10", "cifar-100"]:
+    raise ValueError("Unknown dataset {}".format(config.dataset))
 
-  print("Training on {} dataset.".format(FLAGS.dataset))
+  print("Training on {} dataset.".format(config.dataset))
   sys.stdout.flush()
-  data_dir = os.path.join(FLAGS.data_dir, FLAGS.dataset)
+  data_dir = os.path.join(data_dir, config.dataset)
   if FLAGS.validate:
     # 40k Training set
     ds_train = cifar_input.get_ds_from_tfrecords(
@@ -168,7 +179,7 @@ def get_datasets(config):
         prefetch=config.batch_size)
     ds_validation = None
 
-  # Always compute loss and accuracy on whole training and test set
+  # Always compute loss and accuracy on whole test set
   ds_train_one_shot = cifar_input.get_ds_from_tfrecords(
       data_dir=data_dir,
       split="train_all",
@@ -196,19 +207,15 @@ def get_datasets(config):
 
 def train_one_iter(model, inputs, labels, optimizer, global_step=None):
   """Train for one iteration."""
-  if FLAGS.manual_grad:
-    grads, vars_, loss = model.compute_gradients(inputs, labels, training=True)
-    optimizer.apply_gradients(zip(grads, vars_), global_step=global_step)
-  else:  # For correctness validation
-    with tf.GradientTape() as tape:
-      logits, _ = model(inputs, training=True)
-      loss = model.compute_loss(logits=logits, labels=labels)
-      tf.logging.info("Logits are placed on device: {}".format(logits.device))
-    grads = tape.gradient(loss, model.trainable_variables)
-    optimizer.apply_gradients(
-        zip(grads, model.trainable_variables), global_step=global_step)
+  logits, saved_hiddens = model(inputs, training=True)
+  values = model.get_moving_stats()
+  grads, loss = model.compute_gradients(saved_hiddens, labels)
+  # Restore moving averages when executing eagerly to avoid updating twice
+  model.restore_moving_stats(values)
+  apply_gradients(
+      optimizer, grads, model.trainable_variables, global_step=global_step)
 
-  return loss.numpy()
+  return logits, loss
 
 
 def evaluate(model, iterator):
@@ -241,16 +248,18 @@ if __name__ == "__main__":
       "validate",
       default=False,
       help="[Optional] Use the validation set or not for hyperparameter search")
-  flags.DEFINE_boolean(
-      "manual_grad",
-      default=False,
-      help="[Optional] Use manual gradient graph to save memory")
   flags.DEFINE_string(
       "dataset",
       default="cifar-10",
       help="[Optional] The dataset used; either `cifar-10` or `cifar-100`")
   flags.DEFINE_string(
-      "config", default="revnet-38", help="[Optional] Architecture of network.")
+      "config",
+      default="revnet-38",
+      help="[Optional] Architecture of network. "
+      "Other options include `revnet-110` and `revnet-164`")
+  flags.DEFINE_boolean(
+      "use_defun",
+      default=False,
+      help="[Optional] Use `tfe.defun` to boost performance.")
   FLAGS = flags.FLAGS
-  tf.enable_eager_execution()
   tf.app.run(main)
diff --git a/tensorflow/contrib/eager/python/examples/revnet/main_estimator.py b/tensorflow/contrib/eager/python/examples/revnet/main_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..a21b573c0613c015f64da24127bfbe112ca576be
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/revnet/main_estimator.py
@@ -0,0 +1,200 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Estimator workflow with RevNet train on CIFAR-10."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from absl import flags
+import tensorflow as tf
+from tensorflow.contrib.eager.python.examples.revnet import cifar_input
+from tensorflow.contrib.eager.python.examples.revnet import main as main_
+from tensorflow.contrib.eager.python.examples.revnet import revnet
+
+
+def model_fn(features, labels, mode, params):
+  """Function specifying the model that is required by the `tf.estimator` API.
+
+  Args:
+    features: Input images
+    labels: Labels of images
+    mode: One of `ModeKeys.TRAIN`, `ModeKeys.EVAL` or 'ModeKeys.PREDICT'
+    params: A dictionary of extra parameter that might be passed
+
+  Returns:
+    An instance of `tf.estimator.EstimatorSpec`
+  """
+
+  inputs = features
+  if isinstance(inputs, dict):
+    inputs = features["image"]
+
+  config = params["config"]
+  model = revnet.RevNet(config=config)
+
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    global_step = tf.train.get_or_create_global_step()
+    learning_rate = tf.train.piecewise_constant(
+        global_step, config.lr_decay_steps, config.lr_list)
+    optimizer = tf.train.MomentumOptimizer(
+        learning_rate, momentum=config.momentum)
+    logits, saved_hidden = model(inputs, training=True)
+    grads, loss = model.compute_gradients(saved_hidden, labels, training=True)
+    with tf.control_dependencies(model.get_updates_for(inputs)):
+      train_op = optimizer.apply_gradients(
+          zip(grads, model.trainable_variables), global_step=global_step)
+
+    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
+  else:
+    logits, _ = model(inputs, training=False)
+    predictions = tf.argmax(logits, axis=1)
+    probabilities = tf.nn.softmax(logits)
+
+    if mode == tf.estimator.ModeKeys.EVAL:
+      loss = model.compute_loss(labels=labels, logits=logits)
+      return tf.estimator.EstimatorSpec(
+          mode=mode,
+          loss=loss,
+          eval_metric_ops={
+              "accuracy":
+                  tf.metrics.accuracy(labels=labels, predictions=predictions)
+          })
+
+    else:  # mode == tf.estimator.ModeKeys.PREDICT
+      result = {
+          "classes": predictions,
+          "probabilities": probabilities,
+      }
+
+      return tf.estimator.EstimatorSpec(
+          mode=mode,
+          predictions=predictions,
+          export_outputs={
+              "classify": tf.estimator.export.PredictOutput(result)
+          })
+
+
+def get_input_fn(config, data_dir, split):
+  """Get the input function that is required by the `tf.estimator` API.
+
+  Args:
+    config: Customized hyperparameters
+    data_dir: Directory where the data is stored
+    split: One of `train`, `validation`, `train_all`, and `test`
+
+  Returns:
+    Input function required by the `tf.estimator` API
+  """
+
+  data_dir = os.path.join(data_dir, config.dataset)
+  # Fix split-dependent hyperparameters
+  if split == "train_all" or split == "train":
+    data_aug = True
+    batch_size = config.batch_size
+    epochs = config.epochs
+    shuffle = True
+    prefetch = config.batch_size
+  else:
+    data_aug = False
+    batch_size = config.eval_batch_size
+    epochs = 1
+    shuffle = False
+    prefetch = config.eval_batch_size
+
+  def input_fn():
+    """Input function required by the `tf.estimator.Estimator` API."""
+    return cifar_input.get_ds_from_tfrecords(
+        data_dir=data_dir,
+        split=split,
+        data_aug=data_aug,
+        batch_size=batch_size,
+        epochs=epochs,
+        shuffle=shuffle,
+        prefetch=prefetch,
+        data_format=config.data_format)
+
+  return input_fn
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  # RevNet specific configuration
+  config = main_.get_config(config_name=FLAGS.config, dataset=FLAGS.dataset)
+
+  # Estimator specific configuration
+  run_config = tf.estimator.RunConfig(
+      model_dir=FLAGS.train_dir,  # Directory for storing checkpoints
+      tf_random_seed=config.seed,
+      save_summary_steps=config.log_every,
+      save_checkpoints_steps=config.log_every,
+      session_config=None,  # Using default
+      keep_checkpoint_max=100,
+      keep_checkpoint_every_n_hours=10000,  # Using default
+      log_step_count_steps=config.log_every,
+      train_distribute=None  # Default not use distribution strategy
+  )
+
+  # Construct estimator
+  revnet_estimator = tf.estimator.Estimator(
+      model_fn=model_fn,
+      model_dir=FLAGS.train_dir,
+      config=run_config,
+      params={"config": config})
+
+  # Construct input functions
+  train_input_fn = get_input_fn(
+      config=config, data_dir=FLAGS.data_dir, split="train_all")
+  eval_input_fn = get_input_fn(
+      config=config, data_dir=FLAGS.data_dir, split="test")
+
+  # Train and evaluate estimator
+  revnet_estimator.train(input_fn=train_input_fn)
+  revnet_estimator.evaluate(input_fn=eval_input_fn)
+
+  if FLAGS.export:
+    input_shape = (None,) + config.input_shape
+    inputs = tf.placeholder(tf.float32, shape=input_shape)
+    input_fn = tf.estimator.export.build_raw_serving_input_receiver_fn({
+        "image": inputs
+    })
+    revnet_estimator.export_savedmodel(FLAGS.train_dir, input_fn)
+
+
+if __name__ == "__main__":
+  flags.DEFINE_string(
+      "data_dir", default=None, help="Directory to load tfrecords")
+  flags.DEFINE_string(
+      "train_dir",
+      default=None,
+      help="[Optional] Directory to store the training information")
+  flags.DEFINE_string(
+      "dataset",
+      default="cifar-10",
+      help="[Optional] The dataset used; either `cifar-10` or `cifar-100`")
+  flags.DEFINE_boolean(
+      "export",
+      default=False,
+      help="[Optional] Export the model for serving if True")
+  flags.DEFINE_string(
+      "config",
+      default="revnet-38",
+      help="[Optional] Architecture of network. "
+      "Other options include `revnet-110` and `revnet-164`")
+  FLAGS = flags.FLAGS
+  tf.app.run()
diff --git a/tensorflow/contrib/eager/python/examples/revnet/main_estimator_tpu.py b/tensorflow/contrib/eager/python/examples/revnet/main_estimator_tpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0aad9b11088e72e9027e3ba59c1924ace9ee558
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/revnet/main_estimator_tpu.py
@@ -0,0 +1,319 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Cloud TPU Estimator workflow with RevNet train on CIFAR-10."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import time
+
+from absl import flags
+import tensorflow as tf
+from tensorflow.contrib.eager.python.examples.revnet import cifar_input
+from tensorflow.contrib.eager.python.examples.revnet import main as main_
+from tensorflow.contrib.eager.python.examples.revnet import revnet
+from tensorflow.contrib.training.python.training import evaluation
+from tensorflow.python.estimator import estimator as estimator_
+
+
+def model_fn(features, labels, mode, params):
+  """Model function required by the `tf.contrib.tpu.TPUEstimator` API.
+
+  Args:
+    features: Input images
+    labels: Labels of images
+    mode: One of `ModeKeys.TRAIN`, `ModeKeys.EVAL` or 'ModeKeys.PREDICT'
+    params: A dictionary of extra parameter that might be passed
+
+  Returns:
+    An instance of `tf.contrib.tpu.TPUEstimatorSpec`
+  """
+
+  inputs = features
+  if isinstance(inputs, dict):
+    inputs = features["image"]
+
+  config = params["config"]
+  model = revnet.RevNet(config=config)
+
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    global_step = tf.train.get_or_create_global_step()
+    learning_rate = tf.train.piecewise_constant(
+        global_step, config.lr_decay_steps, config.lr_list)
+    optimizer = tf.train.MomentumOptimizer(
+        learning_rate, momentum=config.momentum)
+
+    if FLAGS.use_tpu:
+      optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
+
+    logits, saved_hidden = model(inputs, training=True)
+    grads, loss = model.compute_gradients(saved_hidden, labels, training=True)
+    train_op = optimizer.apply_gradients(
+        zip(grads, model.trainable_variables), global_step=global_step)
+
+    return tf.contrib.tpu.TPUEstimatorSpec(
+        mode=tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op)
+
+  elif mode == tf.estimator.ModeKeys.EVAL:
+    logits, _ = model(inputs, training=False)
+    loss = model.compute_loss(labels=labels, logits=logits)
+
+    def metric_fn(labels, logits):
+      predictions = tf.argmax(logits, axis=1)
+      accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions)
+      return {
+          "accuracy": accuracy,
+      }
+
+    return tf.contrib.tpu.TPUEstimatorSpec(
+        mode=mode, loss=loss, eval_metrics=(metric_fn, [labels, logits]))
+
+  else:  # Predict or export
+    logits, _ = model(inputs, training=False)
+    predictions = {
+        "classes": tf.argmax(logits, axis=1),
+        "probabilities": tf.nn.softmax(logits),
+    }
+
+    return tf.contrib.tpu.TPUEstimatorSpec(
+        mode=mode,
+        predictions=predictions,
+        export_outputs={
+            "classify": tf.estimator.export.PredictOutput(predictions)
+        })
+
+
+def get_input_fn(config, data_dir, split):
+  """Get the input function required by the `tf.contrib.tpu.TPUEstimator` API.
+
+  Args:
+    config: Customized hyperparameters
+    data_dir: Directory where the data is stored
+    split: One of `train`, `validation`, `train_all`, and `test`
+
+  Returns:
+    Input function required by the `tf.contrib.tpu.TPUEstimator` API
+  """
+
+  data_dir = os.path.join(data_dir, config.dataset)
+  # Fix split-dependent hyperparameters
+  if split == "train_all" or split == "train":
+    data_aug = True
+    epochs = config.tpu_epochs
+    shuffle = True
+  else:
+    data_aug = False
+    epochs = 1
+    shuffle = False
+
+  def input_fn(params):
+    """Input function required by the `tf.contrib.tpu.TPUEstimator` API."""
+    batch_size = params["batch_size"]
+    return cifar_input.get_ds_from_tfrecords(
+        data_dir=data_dir,
+        split=split,
+        data_aug=data_aug,
+        batch_size=batch_size,  # per-shard batch size
+        epochs=epochs,
+        shuffle=shuffle,
+        prefetch=batch_size,  # per-shard batch size
+        data_format=config.data_format)
+
+  return input_fn
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  # RevNet specific configuration
+  config = main_.get_config(config_name=FLAGS.config, dataset=FLAGS.dataset)
+
+  if FLAGS.use_tpu:
+    tf.logging.info("Using TPU.")
+    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
+        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
+  else:
+    tpu_cluster_resolver = None
+
+  # TPU specific configuration
+  tpu_config = tf.contrib.tpu.TPUConfig(
+      # Recommended to be set as number of global steps for next checkpoint
+      iterations_per_loop=FLAGS.iterations_per_loop,
+      num_shards=FLAGS.num_shards)
+
+  # Estimator specific configuration
+  run_config = tf.contrib.tpu.RunConfig(
+      cluster=tpu_cluster_resolver,
+      model_dir=FLAGS.model_dir,
+      session_config=tf.ConfigProto(
+          allow_soft_placement=True, log_device_placement=False),
+      tpu_config=tpu_config,
+  )
+
+  # Construct TPU Estimator
+  estimator = tf.contrib.tpu.TPUEstimator(
+      model_fn=model_fn,
+      use_tpu=FLAGS.use_tpu,
+      train_batch_size=config.tpu_batch_size,
+      eval_batch_size=config.tpu_eval_batch_size,
+      config=run_config,
+      params={"config": config})
+
+  # Construct input functions
+  train_input_fn = get_input_fn(
+      config=config, data_dir=FLAGS.data_dir, split="train_all")
+  eval_input_fn = get_input_fn(
+      config=config, data_dir=FLAGS.data_dir, split="test")
+
+  # Disabling a range within an else block currently doesn't work
+  # due to https://github.com/PyCQA/pylint/issues/872
+  # pylint: disable=protected-access
+  if FLAGS.mode == "eval":
+    # TPUEstimator.evaluate *requires* a steps argument.
+    # Note that the number of examples used during evaluation is
+    # --eval_steps * --batch_size.
+    # So if you change --batch_size then change --eval_steps too.
+    eval_steps = 10000 // config.tpu_eval_batch_size
+
+    # Run evaluation when there's a new checkpoint
+    for ckpt in evaluation.checkpoints_iterator(
+        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
+      tf.logging.info("Starting to evaluate.")
+      try:
+        start_timestamp = time.time()  # This time will include compilation time
+        eval_results = estimator.evaluate(
+            input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=ckpt)
+        elapsed_time = int(time.time() - start_timestamp)
+        tf.logging.info("Eval results: %s. Elapsed seconds: %d" %
+                        (eval_results, elapsed_time))
+
+        # Terminate eval job when final checkpoint is reached
+        current_step = int(os.path.basename(ckpt).split("-")[1])
+        if current_step >= config.max_train_iter:
+          tf.logging.info(
+              "Evaluation finished after training step %d" % current_step)
+          break
+
+      except tf.errors.NotFoundError:
+        # Since the coordinator is on a different job than the TPU worker,
+        # sometimes the TPU worker does not finish initializing until long after
+        # the CPU job tells it to start evaluating. In this case, the checkpoint
+        # file could have been deleted already.
+        tf.logging.info(
+            "Checkpoint %s no longer exists, skipping checkpoint" % ckpt)
+
+  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
+    current_step = estimator_._load_global_step_from_checkpoint_dir(
+        FLAGS.model_dir)
+    tf.logging.info("Training for %d steps . Current"
+                    " step %d." % (config.max_train_iter, current_step))
+
+    start_timestamp = time.time()  # This time will include compilation time
+    if FLAGS.mode == "train":
+      estimator.train(input_fn=train_input_fn, max_steps=config.max_train_iter)
+    else:
+      eval_steps = 10000 // config.tpu_eval_batch_size
+      assert FLAGS.mode == "train_and_eval"
+      while current_step < config.max_train_iter:
+        # Train for up to steps_per_eval number of steps.
+        # At the end of training, a checkpoint will be written to --model_dir.
+        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
+                              config.max_train_iter)
+        estimator.train(input_fn=train_input_fn, max_steps=next_checkpoint)
+        current_step = next_checkpoint
+
+        # Evaluate the model on the most recent model in --model_dir.
+        # Since evaluation happens in batches of --eval_batch_size, some images
+        # may be consistently excluded modulo the batch size.
+        tf.logging.info("Starting to evaluate.")
+        eval_results = estimator.evaluate(
+            input_fn=eval_input_fn, steps=eval_steps)
+        tf.logging.info("Eval results: %s" % eval_results)
+
+    elapsed_time = int(time.time() - start_timestamp)
+    tf.logging.info("Finished training up to step %d. Elapsed seconds %d." %
+                    (config.max_train_iter, elapsed_time))
+  # pylint: enable=protected-access
+
+
+if __name__ == "__main__":
+  # Cloud TPU Cluster Resolver flags
+  flags.DEFINE_string(
+      "tpu",
+      default=None,
+      help="The Cloud TPU to use for training. This should be either the name "
+      "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
+      "url.")
+  flags.DEFINE_string(
+      "tpu_zone",
+      default=None,
+      help="[Optional] GCE zone where the Cloud TPU is located in. If not "
+      "specified, we will attempt to automatically detect the GCE project from "
+      "metadata.")
+  flags.DEFINE_string(
+      "gcp_project",
+      default=None,
+      help="[Optional] Project name for the Cloud TPU-enabled project. If not "
+      "specified, we will attempt to automatically detect the GCE project from "
+      "metadata.")
+
+  # Model specific parameters
+  flags.DEFINE_string(
+      "data_dir", default=None, help="Directory to load tfrecords")
+  flags.DEFINE_string(
+      "model_dir",
+      default=None,
+      help="[Optional] Directory to store the model information")
+  flags.DEFINE_string(
+      "dataset",
+      default="cifar-10",
+      help="[Optional] The dataset used; either `cifar-10` or `cifar-100`")
+  flags.DEFINE_string(
+      "config",
+      default="revnet-38",
+      help="[Optional] Architecture of network. "
+      "Other options include `revnet-110` and `revnet-164`")
+  flags.DEFINE_boolean(
+      "use_tpu", default=True, help="[Optional] Whether to use TPU")
+  flags.DEFINE_integer(
+      "num_shards", default=8, help="Number of shards (TPU chips).")
+  flags.DEFINE_integer(
+      "iterations_per_loop",
+      default=100,
+      help=(
+          "Number of steps to run on TPU before feeding metrics to the CPU."
+          " If the number of iterations in the loop would exceed the number of"
+          " train steps, the loop will exit before reaching"
+          " --iterations_per_loop. The larger this value is, the higher the"
+          " utilization on the TPU."))
+  flags.DEFINE_string(
+      "mode",
+      default="train_and_eval",
+      help="[Optional] Mode to run: train, eval, train_and_eval")
+  flags.DEFINE_integer(
+      "eval_timeout", 60 * 60 * 24,
+      "Maximum seconds between checkpoints before evaluation terminates.")
+  flags.DEFINE_integer(
+      "steps_per_eval",
+      default=1000,
+      help=(
+          "Controls how often evaluation is performed. Since evaluation is"
+          " fairly expensive, it is advised to evaluate as infrequently as"
+          " possible (i.e. up to --train_steps, which evaluates the model only"
+          " after finishing the entire training regime)."))
+  FLAGS = flags.FLAGS
+  tf.app.run()
diff --git a/tensorflow/contrib/eager/python/examples/revnet/resnet_preprocessing.py b/tensorflow/contrib/eager/python/examples/revnet/resnet_preprocessing.py
new file mode 100644
index 0000000000000000000000000000000000000000..21a1ab85d46cde11453e1f693cc4aabbbf3c90ed
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/revnet/resnet_preprocessing.py
@@ -0,0 +1,190 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ImageNet preprocessing for ResNet."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+IMAGE_SIZE = 224
+CROP_PADDING = 32
+
+
+def distorted_bounding_box_crop(image_bytes,
+                                bbox,
+                                min_object_covered=0.1,
+                                aspect_ratio_range=(0.75, 1.33),
+                                area_range=(0.05, 1.0),
+                                max_attempts=100,
+                                scope=None):
+  """Generates cropped_image using one of the bboxes randomly distorted.
+
+  See `tf.image.sample_distorted_bounding_box` for more documentation.
+
+  Args:
+    image_bytes: `Tensor` of binary image data.
+    bbox: `Tensor` of bounding boxes arranged `[1, num_boxes, coords]`
+        where each coordinate is [0, 1) and the coordinates are arranged
+        as `[ymin, xmin, ymax, xmax]`. If num_boxes is 0 then use the whole
+        image.
+    min_object_covered: An optional `float`. Defaults to `0.1`. The cropped
+        area of the image must contain at least this fraction of any bounding
+        box supplied.
+    aspect_ratio_range: An optional list of `float`s. The cropped area of the
+        image must have an aspect ratio = width / height within this range.
+    area_range: An optional list of `float`s. The cropped area of the image
+        must contain a fraction of the supplied image within in this range.
+    max_attempts: An optional `int`. Number of attempts at generating a cropped
+        region of the image of the specified constraints. After `max_attempts`
+        failures, return the entire image.
+    scope: Optional `str` for name scope.
+  Returns:
+    cropped image `Tensor`
+  """
+  with tf.name_scope(scope, 'distorted_bounding_box_crop', [image_bytes, bbox]):
+    shape = tf.image.extract_jpeg_shape(image_bytes)
+    sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box(
+        shape,
+        bounding_boxes=bbox,
+        min_object_covered=min_object_covered,
+        aspect_ratio_range=aspect_ratio_range,
+        area_range=area_range,
+        max_attempts=max_attempts,
+        use_image_if_no_bounding_boxes=True)
+    bbox_begin, bbox_size, _ = sample_distorted_bounding_box
+
+    # Crop the image to the specified bounding box.
+    offset_y, offset_x, _ = tf.unstack(bbox_begin)
+    target_height, target_width, _ = tf.unstack(bbox_size)
+    crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
+    image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3)
+
+    return image
+
+
+def _at_least_x_are_equal(a, b, x):
+  """At least `x` of `a` and `b` `Tensors` are equal."""
+  match = tf.equal(a, b)
+  match = tf.cast(match, tf.int32)
+  return tf.greater_equal(tf.reduce_sum(match), x)
+
+
+def _decode_and_random_crop(image_bytes, image_size):
+  """Make a random crop of image_size."""
+  bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4])
+  image = distorted_bounding_box_crop(
+      image_bytes,
+      bbox,
+      min_object_covered=0.1,
+      aspect_ratio_range=(3. / 4, 4. / 3.),
+      area_range=(0.08, 1.0),
+      max_attempts=10,
+      scope=None)
+  original_shape = tf.image.extract_jpeg_shape(image_bytes)
+  bad = _at_least_x_are_equal(original_shape, tf.shape(image), 3)
+
+  image = tf.cond(
+      bad,
+      lambda: _decode_and_center_crop(image_bytes, image_size),
+      lambda: tf.image.resize_bicubic([image],  # pylint: disable=g-long-lambda
+                                      [image_size, image_size])[0])
+
+  return image
+
+
+def _decode_and_center_crop(image_bytes, image_size):
+  """Crops to center of image with padding then scales image_size."""
+  shape = tf.image.extract_jpeg_shape(image_bytes)
+  image_height = shape[0]
+  image_width = shape[1]
+
+  padded_center_crop_size = tf.cast(
+      ((image_size / (image_size + CROP_PADDING)) *
+       tf.cast(tf.minimum(image_height, image_width), tf.float32)),
+      tf.int32)
+
+  offset_height = ((image_height - padded_center_crop_size) + 1) // 2
+  offset_width = ((image_width - padded_center_crop_size) + 1) // 2
+  crop_window = tf.stack([offset_height, offset_width,
+                          padded_center_crop_size, padded_center_crop_size])
+  image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3)
+  image = tf.image.resize_bicubic([image], [image_size, image_size])[0]
+
+  return image
+
+
+def _flip(image):
+  """Random horizontal image flip."""
+  image = tf.image.random_flip_left_right(image)
+  return image
+
+
+def preprocess_for_train(image_bytes, use_bfloat16, image_size=IMAGE_SIZE):
+  """Preprocesses the given image for evaluation.
+
+  Args:
+    image_bytes: `Tensor` representing an image binary of arbitrary size.
+    use_bfloat16: `bool` for whether to use bfloat16.
+    image_size: image size.
+
+  Returns:
+    A preprocessed image `Tensor`.
+  """
+  image = _decode_and_random_crop(image_bytes, image_size)
+  image = _flip(image)
+  image = tf.reshape(image, [image_size, image_size, 3])
+  image = tf.image.convert_image_dtype(
+      image, dtype=tf.bfloat16 if use_bfloat16 else tf.float32)
+  return image
+
+
+def preprocess_for_eval(image_bytes, use_bfloat16, image_size=IMAGE_SIZE):
+  """Preprocesses the given image for evaluation.
+
+  Args:
+    image_bytes: `Tensor` representing an image binary of arbitrary size.
+    use_bfloat16: `bool` for whether to use bfloat16.
+    image_size: image size.
+
+  Returns:
+    A preprocessed image `Tensor`.
+  """
+  image = _decode_and_center_crop(image_bytes, image_size)
+  image = tf.reshape(image, [image_size, image_size, 3])
+  image = tf.image.convert_image_dtype(
+      image, dtype=tf.bfloat16 if use_bfloat16 else tf.float32)
+  return image
+
+
+def preprocess_image(image_bytes,
+                     is_training=False,
+                     use_bfloat16=False,
+                     image_size=IMAGE_SIZE):
+  """Preprocesses the given image.
+
+  Args:
+    image_bytes: `Tensor` representing an image binary of arbitrary size.
+    is_training: `bool` for whether the preprocessing is for training.
+    use_bfloat16: `bool` for whether to use bfloat16.
+    image_size: image size.
+
+  Returns:
+    A preprocessed image `Tensor`.
+  """
+  if is_training:
+    return preprocess_for_train(image_bytes, use_bfloat16, image_size)
+  else:
+    return preprocess_for_eval(image_bytes, use_bfloat16, image_size)
diff --git a/tensorflow/contrib/eager/python/examples/revnet/revnet.py b/tensorflow/contrib/eager/python/examples/revnet/revnet.py
index af0d20fa729836b12036d5d54a9b5b0b68d719d2..1f2cb14972f0b92d29489adff8f94e790e1ec4ed 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/revnet.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/revnet.py
@@ -24,10 +24,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools
-import operator
-
-import six
 import tensorflow as tf
 from tensorflow.contrib.eager.python.examples.revnet import blocks
 
@@ -45,71 +41,10 @@ class RevNet(tf.keras.Model):
     self.axis = 1 if config.data_format == "channels_first" else 3
     self.config = config
 
-    self._init_block = self._construct_init_block()
+    self._init_block = blocks.InitBlock(config=self.config)
+    self._final_block = blocks.FinalBlock(config=self.config)
     self._block_list = self._construct_intermediate_blocks()
-    self._final_block = self._construct_final_block()
-
-  def _construct_init_block(self):
-    init_block = tf.keras.Sequential(
-        [
-            tf.keras.layers.Conv2D(
-                filters=self.config.init_filters,
-                kernel_size=self.config.init_kernel,
-                strides=(self.config.init_stride, self.config.init_stride),
-                data_format=self.config.data_format,
-                use_bias=False,
-                padding="SAME",
-                input_shape=self.config.input_shape,
-                dtype=self.config.dtype),
-            tf.keras.layers.BatchNormalization(
-                axis=self.axis,
-                fused=self.config.fused,
-                dtype=self.config.dtype),
-            tf.keras.layers.Activation("relu"),
-        ],
-        name="init")
-    if self.config.init_max_pool:
-      init_block.add(
-          tf.keras.layers.MaxPooling2D(
-              pool_size=(3, 3),
-              strides=(2, 2),
-              padding="SAME",
-              data_format=self.config.data_format,
-              dtype=self.config.dtype))
-    return init_block
-
-  def _construct_final_block(self):
-    f = self.config.filters[-1]  # Number of filters
-    r = functools.reduce(operator.mul, self.config.strides, 1)  # Reduce ratio
-    r *= self.config.init_stride
-    if self.config.init_max_pool:
-      r *= 2
-
-    if self.config.data_format == "channels_first":
-      w, h = self.config.input_shape[1], self.config.input_shape[2]
-      input_shape = (f, w // r, h // r)
-    elif self.config.data_format == "channels_last":
-      w, h = self.config.input_shape[0], self.config.input_shape[1]
-      input_shape = (w // r, h // r, f)
-    else:
-      raise ValueError("Data format should be either `channels_first`"
-                       " or `channels_last`")
-
-    final_block = tf.keras.Sequential(
-        [
-            tf.keras.layers.BatchNormalization(
-                axis=self.axis,
-                input_shape=input_shape,
-                fused=self.config.fused,
-                dtype=self.config.dtype),
-            tf.keras.layers.Activation("relu"),
-            tf.keras.layers.GlobalAveragePooling2D(
-                data_format=self.config.data_format, dtype=self.config.dtype),
-            tf.keras.layers.Dense(
-                self.config.n_classes, dtype=self.config.dtype)
-        ],
-        name="final")
-    return final_block
+    self._moving_average_variables = []
 
   def _construct_intermediate_blocks(self):
     # Precompute input shape after initial block
@@ -193,109 +128,90 @@ class RevNet(tf.keras.Model):
 
     return tf.reduce_mean(cross_ent)
 
-  def compute_gradients(self, inputs, labels, training=True, l2_reg=True):
+  def compute_gradients(self, saved_hidden, labels, training=True, l2_reg=True):
     """Manually computes gradients.
 
-    When eager execution is enabled, this method also SILENTLY updates the
-    running averages of batch normalization when `training` is set to True.
+    This method silently updates the running averages of batch normalization.
 
     Args:
-      inputs: Image tensor, either NHWC or NCHW, conforming to `data_format`
+      saved_hidden: List of hidden states Tensors
       labels: One-hot labels for classification
       training: Use the mini-batch stats in batch norm if set to True
       l2_reg: Apply l2 regularization
 
     Returns:
-      list of tuples each being (grad, var) for optimizer to use
+      A tuple with the first entry being a list of all gradients and the second
+      being the loss
     """
 
-    # Run forward pass to record hidden states; avoid updating running averages
-    vars_and_vals = self.get_moving_stats()
-    _, saved_hidden = self.call(inputs, training=training)
-    self.restore_moving_stats(vars_and_vals)
-
-    grads_all = []
-    vars_all = []
+    def _defunable_pop(l):
+      """Functional style list pop that works with `tfe.defun`."""
+      t, l = l[-1], l[:-1]
+      return t, l
 
-    # Manually backprop through last block
+    # Backprop through last block
     x = saved_hidden[-1]
     with tf.GradientTape() as tape:
-      x = tf.identity(x)
       tape.watch(x)
-      # Running stats updated below
       logits = self._final_block(x, training=training)
       loss = self.compute_loss(logits, labels)
-
     grads_combined = tape.gradient(loss,
                                    [x] + self._final_block.trainable_variables)
-    dy, grads_ = grads_combined[0], grads_combined[1:]
-    grads_all += grads_
-    vars_all += self._final_block.trainable_variables
+    dy, final_grads = grads_combined[0], grads_combined[1:]
 
-    # Manually backprop through intermediate blocks
+    # Backprop through intermediate blocks
+    intermediate_grads = []
     for block in reversed(self._block_list):
-      y = saved_hidden.pop()
+      y, saved_hidden = _defunable_pop(saved_hidden)
       x = saved_hidden[-1]
-      dy, grads, vars_ = block.backward_grads_and_vars(
-          x, y, dy, training=training)
-      grads_all += grads
-      vars_all += vars_
-
-    # Manually backprop through first block
-    saved_hidden.pop()
-    x = saved_hidden.pop()
-    assert not saved_hidden  # Cleared after backprop
+      dy, grads = block.backward_grads(x, y, dy, training=training)
+      intermediate_grads = grads + intermediate_grads
 
+    # Backprop through first block
+    _, saved_hidden = _defunable_pop(saved_hidden)
+    x, saved_hidden = _defunable_pop(saved_hidden)
+    assert not saved_hidden
     with tf.GradientTape() as tape:
-      x = tf.identity(x)
-      # Running stats updated below
       y = self._init_block(x, training=training)
-
-    grads_all += tape.gradient(
+    init_grads = tape.gradient(
         y, self._init_block.trainable_variables, output_gradients=dy)
-    vars_all += self._init_block.trainable_variables
 
-    # Apply weight decay
+    # Ordering match up with `model.trainable_variables`
+    grads_all = init_grads + final_grads + intermediate_grads
     if l2_reg:
-      grads_all = self._apply_weight_decay(grads_all, vars_all)
+      grads_all = self._apply_weight_decay(grads_all)
 
-    return grads_all, vars_all, loss
+    return grads_all, loss
 
-  def _apply_weight_decay(self, grads, vars_):
+  def _apply_weight_decay(self, grads):
     """Update gradients to reflect weight decay."""
-    # Don't decay bias
     return [
         g + self.config.weight_decay * v if v.name.endswith("kernel:0") else g
-        for g, v in zip(grads, vars_)
+        for g, v in zip(grads, self.trainable_variables)
     ]
 
   def get_moving_stats(self):
-    """Get moving averages of batch normalization.
-
-    This is needed to avoid updating the running average twice in one iteration.
-
-    Returns:
-      A dictionary mapping variables for batch normalization moving averages
-      to their current values.
-    """
-    vars_and_vals = {}
-
-    def _is_moving_var(v):
+    """Get moving averages of batch normalization."""
+    device = "/gpu:0" if tf.test.is_gpu_available() else "/cpu:0"
+    with tf.device(device):
+      return [v.read_value() for v in self.moving_average_variables]
+
+  def restore_moving_stats(self, values):
+    """Restore moving averages of batch normalization."""
+    device = "/gpu:0" if tf.test.is_gpu_available() else "/cpu:0"
+    with tf.device(device):
+      for var_, val in zip(self.moving_average_variables, values):
+        var_.assign(val)
+
+  @property
+  def moving_average_variables(self):
+    """Get all variables that are batch norm moving averages."""
+
+    def _is_moving_avg(v):
       n = v.name
       return n.endswith("moving_mean:0") or n.endswith("moving_variance:0")
 
-    for v in filter(_is_moving_var, self.variables):
-      vars_and_vals[v] = v.read_value()
+    if not self._moving_average_variables:
+      self._moving_average_variables = filter(_is_moving_avg, self.variables)
 
-    return vars_and_vals
-
-  def restore_moving_stats(self, vars_and_vals):
-    """Restore moving averages of batch normalization.
-
-    This is needed to avoid updating the running average twice in one iteration.
-
-    Args:
-      vars_and_vals: The dictionary mapping variables to their previous values.
-    """
-    for var_, val in six.iteritems(vars_and_vals):
-      var_.assign(val)
+    return self._moving_average_variables
diff --git a/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py b/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py
index b0d0a5486db9364e6d2d0dcc09742f36d8db3ba6..84b2ddf0de0739936d458ae1bce832cfbb167d64 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/revnet_test.py
@@ -31,10 +31,13 @@ tfe = tf.contrib.eager
 
 def train_one_iter(model, inputs, labels, optimizer, global_step=None):
   """Train for one iteration."""
-  grads, vars_, loss = model.compute_gradients(inputs, labels, training=True)
-  optimizer.apply_gradients(zip(grads, vars_), global_step=global_step)
+  logits, saved_hidden = model(inputs)
+  grads, loss = model.compute_gradients(
+      saved_hidden=saved_hidden, labels=labels)
+  optimizer.apply_gradients(
+      zip(grads, model.trainable_variables), global_step=global_step)
 
-  return loss
+  return logits, loss
 
 
 class RevNetTest(tf.test.TestCase):
@@ -42,6 +45,8 @@ class RevNetTest(tf.test.TestCase):
   def setUp(self):
     super(RevNetTest, self).setUp()
     config = config_.get_hparams_cifar_38()
+    config.add_hparam("n_classes", 10)
+    config.add_hparam("dataset", "cifar-10")
     # Reconstruction could cause numerical error, use double precision for tests
     config.dtype = tf.float64
     config.fused = False  # Fused batch norm does not support tf.float64
@@ -93,9 +98,10 @@ class RevNetTest(tf.test.TestCase):
 
   def test_compute_gradients(self):
     """Test `compute_gradients` function."""
-    self.model(self.x, training=False)  # Initialize model
-    grads, vars_, loss = self.model.compute_gradients(
-        inputs=self.x, labels=self.t, training=True, l2_reg=True)
+    _, saved_hidden = self.model(self.x)  # Initialize model
+    grads, loss = self.model.compute_gradients(
+        saved_hidden=saved_hidden, labels=self.t)
+    vars_ = self.model.trainable_variables
     self.assertTrue(isinstance(grads, list))
     self.assertTrue(isinstance(vars_, list))
     self.assertEqual(len(grads), len(vars_))
@@ -104,7 +110,7 @@ class RevNetTest(tf.test.TestCase):
 
     # Compare against the true gradient computed by the tape
     with tf.GradientTape() as tape:
-      logits, _ = self.model(self.x, training=True)
+      logits, _ = self.model(self.x)
       loss_true = self.model.compute_loss(logits=logits, labels=self.t)
     grads_true = tape.gradient(loss_true, vars_)
     self.assertAllClose(loss, loss_true)
@@ -119,7 +125,9 @@ class RevNetTest(tf.test.TestCase):
   def test_compute_gradients_defun(self):
     """Test `compute_gradients` function with defun."""
     compute_gradients = tfe.defun(self.model.compute_gradients)
-    grads, vars_, _ = compute_gradients(self.x, self.t, training=True)
+    _, saved_hidden = self.model(self.x)
+    grads, _ = compute_gradients(saved_hidden=saved_hidden, labels=self.t)
+    vars_ = self.model.trainable_variables
     self.assertTrue(isinstance(grads, list))
     self.assertTrue(isinstance(vars_, list))
     self.assertEqual(len(grads), len(vars_))
@@ -131,6 +139,9 @@ class RevNetTest(tf.test.TestCase):
     """Test model training in graph mode."""
     with tf.Graph().as_default():
       config = config_.get_hparams_cifar_38()
+      config.add_hparam("n_classes", 10)
+      config.add_hparam("dataset", "cifar-10")
+
       x = tf.random_normal(
           shape=(self.config.batch_size,) + self.config.input_shape)
       t = tf.random_uniform(
@@ -140,15 +151,11 @@ class RevNetTest(tf.test.TestCase):
           dtype=tf.int32)
       global_step = tf.Variable(0., trainable=False)
       model = revnet.RevNet(config=config)
-      model(x)
-      updates = model.get_updates_for(x)
-
-      x_ = tf.identity(x)
-      grads_all, vars_all, _ = model.compute_gradients(x_, t, training=True)
+      _, saved_hidden = model(x)
+      grads, _ = model.compute_gradients(saved_hidden=saved_hidden, labels=t)
       optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
-      with tf.control_dependencies(updates):
-        train_op = optimizer.apply_gradients(
-            zip(grads_all, vars_all), global_step=global_step)
+      train_op = optimizer.apply_gradients(
+          zip(grads, model.trainable_variables), global_step=global_step)
 
       with tf.Session() as sess:
         sess.run(tf.global_variables_initializer())
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
index d64bf5354eb1061cc84c03107f5d27bfd02ef079..15776c694e92825895437a4c1547699f6d9269fb 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py
@@ -315,7 +315,7 @@ def main(_):
                      FLAGS.hidden_dim, FLAGS.num_layers, FLAGS.dropout,
                      use_cudnn_rnn)
     optimizer = tf.train.GradientDescentOptimizer(learning_rate)
-    checkpoint = tfe.Checkpoint(
+    checkpoint = tf.train.Checkpoint(
         learning_rate=learning_rate, model=model,
         # GradientDescentOptimizer has no state to checkpoint, but noting it
         # here lets us swap in an optimizer that does.
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index efa6ba062631500bd7cd16620ebec23d15b93b62..6efafccd6b93ad58da395e0b2e1e647809af62ad 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -291,8 +291,6 @@ class Metric(checkpointable.CheckpointableBase):
 
 class Mean(Metric):
   """Computes the (weighted) mean of the given values."""
-  # TODO(josh11b): Maybe have a dtype argument that defaults to tf.float64?
-  # Or defaults to type of the input if it is tf.float32, else tf.float64?
 
   def __init__(self, name=None, dtype=dtypes.float64,
                use_global_variables=False):
@@ -377,7 +375,7 @@ class Accuracy(Mean):
         array_ops.shape(labels), array_ops.shape(predictions),
         message="Shapes of labels and predictions are unequal")
     matches = math_ops.equal(labels, predictions)
-    matches = math_ops.cast(matches, dtypes.float64)
+    matches = math_ops.cast(matches, self.dtype)
     super(Accuracy, self).call(matches, weights=weights)
     if weights is None:
       return labels, predictions
@@ -421,7 +419,7 @@ class CategoricalAccuracy(Mean):
     labels = math_ops.argmax(labels, axis=-1)
     predictions = math_ops.argmax(predictions, axis=-1)
     matches = math_ops.equal(labels, predictions)
-    matches = math_ops.cast(matches, dtypes.float64)
+    matches = math_ops.cast(matches, self.dtype)
     super(CategoricalAccuracy, self).call(matches, weights=weights)
     if weights is None:
       return labels, predictions
@@ -472,7 +470,7 @@ class BinaryAccuracy(Mean):
     predictions = ops.convert_to_tensor(predictions)
     predictions = predictions > self.threshold
     matches = math_ops.equal(labels, predictions)
-    matches = math_ops.cast(matches, dtypes.float64)
+    matches = math_ops.cast(matches, self.dtype)
     super(BinaryAccuracy, self).call(matches, weights=weights)
     if weights is None:
       return labels, predictions
@@ -520,7 +518,7 @@ class SparseAccuracy(Mean):
     predictions = math_ops.argmax(predictions, axis=-1)
     labels = math_ops.cast(labels, dtypes.int64)
     matches = math_ops.equal(labels, predictions)
-    matches = math_ops.cast(matches, dtypes.float64)
+    matches = math_ops.cast(matches, self.dtype)
     super(SparseAccuracy, self).call(matches, weights=weights)
     if weights is None:
       return labels, predictions
diff --git a/tensorflow/contrib/eager/python/saver.py b/tensorflow/contrib/eager/python/saver.py
index fdaca90fd13576e6ca8a3408aaf528dbc2384b0c..d70930864784b3e48140da27ca33ff13f593e663 100644
--- a/tensorflow/contrib/eager/python/saver.py
+++ b/tensorflow/contrib/eager/python/saver.py
@@ -125,8 +125,8 @@ class Saver(object):
 
     Args:
       var_list: The list of variables that will be saved and restored. Either a
-        list of `tfe.Variable` objects, or a dictionary mapping names to
-        `tfe.Variable` objects.
+        list of `tf.Variable` objects, or a dictionary mapping names to
+        `tf.Variable` objects.
 
     Raises:
       RuntimeError: if invoked when eager execution has not been enabled.
diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index ca6430253b67d825290b6a376ba3f29b3ae67577..2f0ab616e40560e21dfe19fffb0010f724e48ecd 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -34,6 +34,7 @@ To use, at program startup, call `tfe.enable_eager_execution()`.
 
 @@run
 @@enable_eager_execution
+@@enable_remote_eager_execution
 
 @@custom_gradient
 
@@ -114,6 +115,7 @@ from tensorflow.python.eager.execution_callbacks import inf_nan_callback
 from tensorflow.python.eager.execution_callbacks import nan_callback
 from tensorflow.python.eager.execution_callbacks import seterr
 from tensorflow.python.framework.ops import enable_eager_execution
+from tensorflow.python.framework.ops import enable_eager_execution_internal as enable_remote_eager_execution
 from tensorflow.python.framework.ops import eager_run as run
 from tensorflow.python.framework.test_util import run_in_graph_and_eager_modes as run_test_in_graph_and_eager_modes
 from tensorflow.python.framework.test_util import run_all_in_graph_and_eager_modes as run_all_tests_in_graph_and_eager_modes
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 11d40f5982ac81d7f3d32cada3457037280801cb..349f48f7f788b458af2639f7ad4cc4cd904465b4 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -28,7 +28,8 @@ py_library(
         ":multi_head",
         ":replicate_model_fn",
         ":rnn",
-        "//tensorflow/python:util",
+        ":saved_model_estimator",
+        "//tensorflow:tensorflow_py_no_contrib",
     ],
 )
 
@@ -54,22 +55,10 @@ py_test(
     deps = [
         ":baseline",
         ":head",
-        "//tensorflow/python:check_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:session",
-        "//tensorflow/python:summary",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variables",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:export_export",
         "//tensorflow/python/estimator:metric_keys",
         "//tensorflow/python/estimator:numpy_io",
-        "//tensorflow/python/feature_column",
-        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -96,11 +85,8 @@ py_test(
     ],
     deps = [
         ":boosted_trees",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:training",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:numpy_io",
-        "//tensorflow/python/feature_column",
         "//third_party/py/numpy",
     ],
 )
@@ -110,7 +96,7 @@ py_library(
     srcs = ["python/estimator/dnn.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:nn",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:dnn",
     ],
@@ -129,16 +115,11 @@ py_test(
     deps = [
         ":dnn",
         ":head",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:summary",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:dnn_testing_utils",
         "//tensorflow/python/estimator:export_export",
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:prediction_keys",
-        "//tensorflow/python/feature_column",
-        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -149,7 +130,7 @@ py_library(
     srcs = ["python/estimator/dnn_linear_combined.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:nn",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:dnn_linear_combined",
     ],
@@ -168,18 +149,12 @@ py_test(
     deps = [
         ":dnn_linear_combined",
         ":head",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:nn",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:summary",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:dnn_testing_utils",
         "//tensorflow/python/estimator:export_export",
         "//tensorflow/python/estimator:linear_testing_utils",
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:prediction_keys",
-        "//tensorflow/python/feature_column",
-        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -192,10 +167,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:clip_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:training",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:model_fn",
         "//tensorflow/python/estimator:util",
@@ -211,18 +183,11 @@ py_test(
     tags = ["notsan"],  # b/62863147
     deps = [
         ":extenders",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/contrib/data/python/ops:dataset_ops",
         "//tensorflow/contrib/predictor",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:metrics",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variables",
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/estimator:linear",
-        "//tensorflow/python/feature_column",
         "//third_party/py/numpy",
     ],
 )
@@ -246,21 +211,11 @@ py_test(
     tags = ["notsan"],  # b/62863147
     deps = [
         ":export",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:metrics",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python:variables",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:export_export",
         "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:model_fn",
-        "//tensorflow/python/saved_model:loader",
-        "//tensorflow/python/saved_model:tag_constants",
     ],
 )
 
@@ -271,25 +226,12 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:check_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:lookup_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:metrics",
-        "//tensorflow/python:nn",
-        "//tensorflow/python:sparse_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:summary",
-        "//tensorflow/python:training",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:head",
         "//tensorflow/python/estimator:metric_keys",
         "//tensorflow/python/estimator:model_fn",
         "//tensorflow/python/estimator:prediction_keys",
-        "//tensorflow/python/ops/losses",
-        "//tensorflow/python/saved_model:signature_constants",
     ],
 )
 
@@ -300,25 +242,10 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":head",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:check_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:string_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variables",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:metric_keys",
         "//tensorflow/python/estimator:model_fn",
         "//tensorflow/python/estimator:prediction_keys",
-        "//tensorflow/python/ops/losses",
-        "//tensorflow/python/saved_model:signature_constants",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -331,8 +258,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:training",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:estimator_py",
     ],
 )
@@ -345,10 +271,7 @@ py_test(
     tags = ["notsan"],
     deps = [
         ":hooks",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:estimator_py",
         "//third_party/py/numpy",
         "@six_archive//:six",
@@ -377,16 +300,11 @@ py_test(
     deps = [
         ":head",
         ":linear",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:summary",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:export_export",
         "//tensorflow/python/estimator:linear_testing_utils",
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:prediction_keys",
-        "//tensorflow/python/feature_column",
-        "//tensorflow/python/ops/losses",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -399,8 +317,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:util",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:dnn",
         "//tensorflow/python/estimator:linear",
     ],
@@ -413,9 +330,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":logit_fns",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:session",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:model_fn",
     ],
 )
@@ -427,18 +342,11 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:metrics",
-        "//tensorflow/python:summary",
-        "//tensorflow/python:training",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:head",
         "//tensorflow/python/estimator:metric_keys",
         "//tensorflow/python/estimator:model_fn",
-        "//tensorflow/python/saved_model:signature_constants",
         "@six_archive//:six",
     ],
 )
@@ -451,15 +359,10 @@ py_test(
     deps = [
         ":head",
         ":multi_head",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:string_ops",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:metric_keys",
         "//tensorflow/python/estimator:model_fn",
         "//tensorflow/python/estimator:prediction_keys",
-        "//tensorflow/python/saved_model:signature_constants",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -472,24 +375,10 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:device",
-        "//tensorflow/python:device_lib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:sparse_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python:variable_scope",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator:export_output",
         "//tensorflow/python/estimator:model_fn",
         "//tensorflow/python/estimator:util",
-        "//tensorflow/python/ops/losses",
         "@six_archive//:six",
     ],
 )
@@ -500,6 +389,7 @@ cuda_py_test(
     srcs = ["python/estimator/replicate_model_fn_test.py"],
     additional_deps = [
         "@absl_py//absl/testing:parameterized",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:dnn",
         "//tensorflow/python/estimator:export_export",
@@ -508,21 +398,6 @@ cuda_py_test(
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:optimizers",
         "//tensorflow/python/estimator:prediction_keys",
-        "//tensorflow/python/feature_column",
-        "//tensorflow/python/ops/losses",
-        "//tensorflow/python/saved_model:signature_constants",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:metrics",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:summary",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
         ":replicate_model_fn",
     ],
     tags = [
@@ -538,22 +413,11 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":extenders",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/contrib/feature_column:feature_column_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:check_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:layers",
-        "//tensorflow/python:partitioned_variables",
-        "//tensorflow/python:rnn",
-        "//tensorflow/python:rnn_cell",
-        "//tensorflow/python:summary",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
         "//tensorflow/python/estimator",
         "//tensorflow/python/estimator:head",
         "//tensorflow/python/estimator:optimizers",
-        "//tensorflow/python/feature_column",
         "@six_archive//:six",
     ],
 )
@@ -572,21 +436,10 @@ py_test(
     deps = [
         ":head",
         ":rnn",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/contrib/data",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:check_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:summary",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variables",
         "//tensorflow/python/estimator:numpy_io",
         "//tensorflow/python/estimator:parsing_utils",
-        "//tensorflow/python/feature_column",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
@@ -597,13 +450,7 @@ py_library(
     srcs = ["python/estimator/early_stopping.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:init_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:state_ops",
-        "//tensorflow/python:summary",
-        "//tensorflow/python:training",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator",
     ],
 )
@@ -614,8 +461,48 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":early_stopping",
-        "//tensorflow/python:client_testlib",
+        "//tensorflow:tensorflow_py_no_contrib",
         "//tensorflow/python/estimator",
         "@absl_py//absl/testing:parameterized",
     ],
 )
+
+py_library(
+    name = "saved_model_estimator",
+    srcs = ["python/estimator/saved_model_estimator.py"],
+    deps = [
+        ":export",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:export",
+        "//tensorflow/python/estimator:model_fn",
+        "//tensorflow/python/saved_model",
+    ],
+)
+
+py_test(
+    name = "saved_model_estimator_test",
+    size = "medium",
+    srcs = ["python/estimator/saved_model_estimator_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":export",
+        ":saved_model_estimator",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:metrics",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/estimator",
+        "//tensorflow/python/estimator:export_export",
+        "//tensorflow/python/estimator:export_output",
+        "//tensorflow/python/estimator:model_fn",
+    ],
+)
diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py
index 09fcfd66a1f90d5a323b09472c4b7f5b1234ee35..e1453ae1d04ebd8d72f812b51480f0b05f7a5416 100644
--- a/tensorflow/contrib/estimator/__init__.py
+++ b/tensorflow/contrib/estimator/__init__.py
@@ -33,6 +33,8 @@ from tensorflow.contrib.estimator.python.estimator.logit_fns import *
 from tensorflow.contrib.estimator.python.estimator.multi_head import *
 from tensorflow.contrib.estimator.python.estimator.replicate_model_fn import *
 from tensorflow.contrib.estimator.python.estimator.rnn import *
+from tensorflow.contrib.estimator.python.estimator.saved_model_estimator import *
+from tensorflow.python.estimator.export.export import *
 
 from tensorflow.python.util.all_util import remove_undocumented
 # pylint: enable=unused-import,line-too-long,wildcard-import
@@ -70,6 +72,9 @@ _allowed_symbols = [
     'stop_if_higher_hook',
     'stop_if_no_increase_hook',
     'stop_if_no_decrease_hook',
+    'build_raw_supervised_input_receiver_fn',
+    'build_supervised_input_receiver_fn_from_input_fn',
+    'SavedModelEstimator'
 ]
 
 remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
index 43bfcffd790e7b3c716c3f70820851a8819af225..7ed77bcce6f00ed13e9952951800f1017d582f19 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees.py
@@ -50,7 +50,8 @@ class _BoostedTreesEstimator(estimator.Estimator):
                tree_complexity=0.,
                min_node_weight=0.,
                config=None,
-               center_bias=False):
+               center_bias=False,
+               pruning_mode='none'):
     """Initializes a `BoostedTreesEstimator` instance.
 
     Args:
@@ -89,13 +90,18 @@ class _BoostedTreesEstimator(estimator.Estimator):
         regression problems, the first node will return the mean of the labels.
         For binary classification problems, it will return a logit for a prior
         probability of label 1.
+      pruning_mode: one of 'none', 'pre', 'post' to indicate no pruning, pre-
+        pruning (do not split a node if not enough gain is observed) and post
+        pruning (build the tree up to a max depth and then prune branches with
+        negative gain). For pre and post pruning, you MUST provide
+        tree_complexity >0.
 
     """
     # pylint:disable=protected-access
     # HParams for the model.
     tree_hparams = canned_boosted_trees._TreeHParams(
         n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-        tree_complexity, min_node_weight, center_bias)
+        tree_complexity, min_node_weight, center_bias, pruning_mode)
 
     def _model_fn(features, labels, mode, config):
       return canned_boosted_trees._bt_model_fn(
@@ -129,7 +135,8 @@ def boosted_trees_classifier_train_in_memory(
     min_node_weight=0.,
     config=None,
     train_hooks=None,
-    center_bias=False):
+    center_bias=False,
+    pruning_mode='none'):
   """Trains a boosted tree classifier with in memory dataset.
 
   Example:
@@ -208,6 +215,11 @@ def boosted_trees_classifier_train_in_memory(
         regression problems, the first node will return the mean of the labels.
         For binary classification problems, it will return a logit for a prior
         probability of label 1.
+    pruning_mode: one of 'none', 'pre', 'post' to indicate no pruning, pre-
+        pruning (do not split a node if not enough gain is observed) and post
+        pruning (build the tree up to a max depth and then prune branches with
+        negative gain). For pre and post pruning, you MUST provide
+        tree_complexity >0.
 
   Returns:
     a `BoostedTreesClassifier` instance created with the given arguments and
@@ -228,7 +240,7 @@ def boosted_trees_classifier_train_in_memory(
   # HParams for the model.
   tree_hparams = canned_boosted_trees._TreeHParams(
       n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-      tree_complexity, min_node_weight, center_bias)
+      tree_complexity, min_node_weight, center_bias, pruning_mode)
 
   def _model_fn(features, labels, mode, config):
     return canned_boosted_trees._bt_model_fn(
@@ -269,7 +281,8 @@ def boosted_trees_regressor_train_in_memory(
     min_node_weight=0.,
     config=None,
     train_hooks=None,
-    center_bias=False):
+    center_bias=False,
+    pruning_mode='none'):
   """Trains a boosted tree regressor with in memory dataset.
 
   Example:
@@ -341,6 +354,11 @@ def boosted_trees_regressor_train_in_memory(
         regression problems, the first node will return the mean of the labels.
         For binary classification problems, it will return a logit for a prior
         probability of label 1.
+    pruning_mode: one of 'none', 'pre', 'post' to indicate no pruning, pre-
+        pruning (do not split a node if not enough gain is observed) and post
+        pruning (build the tree up to a max depth and then prune branches with
+        negative gain). For pre and post pruning, you MUST provide
+        tree_complexity >0.
 
   Returns:
     a `BoostedTreesClassifier` instance created with the given arguments and
@@ -360,7 +378,7 @@ def boosted_trees_regressor_train_in_memory(
   # HParams for the model.
   tree_hparams = canned_boosted_trees._TreeHParams(
       n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
-      tree_complexity, min_node_weight, center_bias)
+      tree_complexity, min_node_weight, center_bias, pruning_mode)
 
   def _model_fn(features, labels, mode, config):
     return canned_boosted_trees._bt_model_fn(
diff --git a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
index 999c2aa5e28242f996e12da3807a74c6acf31df9..b1581f37509b5dc2bec98942e88c024905f25d93 100644
--- a/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/boosted_trees_test.py
@@ -136,6 +136,49 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     eval_res = est.evaluate(input_fn=input_fn, steps=1)
     self.assertAllClose(eval_res['average_loss'], 0.614642)
 
+  def testTrainAndEvaluateEstimatorWithPrePruning(self):
+    input_fn = _make_train_input_fn(is_classification=False)
+
+    est = boosted_trees._BoostedTreesEstimator(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=2,
+        head=self._head,
+        max_depth=5,
+        tree_complexity=0.001,
+        pruning_mode='pre')
+
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(input_fn, steps=num_steps)
+    # We stop actually after 2*depth*n_trees steps (via a hook) because we still
+    # could not grow 2 trees of depth 5 (due to pre-pruning).
+    self._assert_checkpoint(
+        est.model_dir, global_step=21, finalized_trees=0, attempted_layers=21)
+    eval_res = est.evaluate(input_fn=input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 3.83943)
+
+  def testTrainAndEvaluateEstimatorWithPostPruning(self):
+    input_fn = _make_train_input_fn(is_classification=False)
+
+    est = boosted_trees._BoostedTreesEstimator(
+        feature_columns=self._feature_columns,
+        n_batches_per_layer=1,
+        n_trees=2,
+        head=self._head,
+        max_depth=5,
+        tree_complexity=0.001,
+        pruning_mode='post')
+
+    # It will stop after 10 steps because of the max depth and num trees.
+    num_steps = 100
+    # Train for a few steps, and validate final checkpoint.
+    est.train(input_fn, steps=num_steps)
+    self._assert_checkpoint(
+        est.model_dir, global_step=10, finalized_trees=2, attempted_layers=10)
+    eval_res = est.evaluate(input_fn=input_fn, steps=1)
+    self.assertAllClose(eval_res['average_loss'], 2.37652)
+
   def testInferEstimator(self):
     train_input_fn = _make_train_input_fn(is_classification=False)
     predict_input_fn = numpy_io.numpy_input_fn(
@@ -231,6 +274,31 @@ class BoostedTreesEstimatorTest(test_util.TensorFlowTestCase):
     self.assertAllClose([[0], [1], [1], [0], [0]],
                         [pred['class_ids'] for pred in predictions])
 
+  def testBinaryClassifierTrainInMemoryAndEvalAndInferWithPrePruning(self):
+    train_input_fn = _make_train_input_fn(is_classification=True)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x=FEATURES_DICT, y=None, batch_size=1, num_epochs=1, shuffle=False)
+
+    est = boosted_trees.boosted_trees_classifier_train_in_memory(
+        train_input_fn=train_input_fn,
+        feature_columns=self._feature_columns,
+        n_trees=1,
+        max_depth=5,
+        pruning_mode='pre',
+        tree_complexity=0.01)
+    # We stop actually after 2*depth*n_trees steps (via a hook) because we still
+    # could not grow 1 trees of depth 5 (due to pre-pruning).
+    self._assert_checkpoint(
+        est.model_dir, global_step=11, finalized_trees=0, attempted_layers=11)
+
+    # Check evaluate and predict.
+    eval_res = est.evaluate(input_fn=train_input_fn, steps=1)
+    self.assertAllClose(eval_res['accuracy'], 1.0)
+    # Validate predictions.
+    predictions = list(est.predict(input_fn=predict_input_fn))
+    self.assertAllClose([[0], [1], [1], [0], [0]],
+                        [pred['class_ids'] for pred in predictions])
+
   def testBinaryClassifierTrainInMemoryWithDataset(self):
     train_input_fn = _make_train_input_fn_dataset(is_classification=True)
     predict_input_fn = numpy_io.numpy_input_fn(
diff --git a/tensorflow/contrib/estimator/python/estimator/early_stopping.py b/tensorflow/contrib/estimator/python/estimator/early_stopping.py
index af4855e91e530b4b2815c8039e4d482e8cef485d..3eab21d5acaf26f14a73e7fa8e9c50fffc22fe9c 100644
--- a/tensorflow/contrib/estimator/python/estimator/early_stopping.py
+++ b/tensorflow/contrib/estimator/python/estimator/early_stopping.py
@@ -394,10 +394,11 @@ def _summaries(eval_dir):
   Yields:
     `tensorflow.Event` object read from the event files.
   """
-  for event_file in gfile.Glob(
-      os.path.join(eval_dir, _EVENT_FILE_GLOB_PATTERN)):
-    for event in summary_iterator.summary_iterator(event_file):
-      yield event
+  if gfile.Exists(eval_dir):
+    for event_file in gfile.Glob(
+        os.path.join(eval_dir, _EVENT_FILE_GLOB_PATTERN)):
+      for event in summary_iterator.summary_iterator(event_file):
+        yield event
 
 
 def _get_or_create_stop_var():
diff --git a/tensorflow/contrib/estimator/python/estimator/early_stopping_test.py b/tensorflow/contrib/estimator/python/estimator/early_stopping_test.py
index b5eee818fa0bce068597a7fb8d99dd86f43d396a..e4bfd4b446b9413bd1627ef6904ff2dc9f1a9120 100644
--- a/tensorflow/contrib/estimator/python/estimator/early_stopping_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/early_stopping_test.py
@@ -92,6 +92,19 @@ class ReadEvalMetricsTest(test.TestCase):
         },
     }, early_stopping.read_eval_metrics(eval_dir))
 
+  def test_read_eval_metrics_when_no_events(self):
+    eval_dir = tempfile.mkdtemp()
+    self.assertTrue(os.path.exists(eval_dir))
+
+    # No error should be raised when eval directory exists with no event files.
+    self.assertEqual({}, early_stopping.read_eval_metrics(eval_dir))
+
+    os.rmdir(eval_dir)
+    self.assertFalse(os.path.exists(eval_dir))
+
+    # No error should be raised when eval directory does not exist.
+    self.assertEqual({}, early_stopping.read_eval_metrics(eval_dir))
+
 
 class EarlyStoppingHooksTest(test.TestCase, parameterized.TestCase):
 
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index c9d86ef4ab89950b0c7b0414ba60d9e0a1cbe476..34f765d56546d3cd10fcde5ac444a221c73602cd 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -943,20 +943,30 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
         class_probabilities = array_ops.slice(
             probabilities, begin=begin, size=size)
         class_labels = array_ops.slice(labels, begin=begin, size=size)
-        prob_key = keys.PROBABILITY_MEAN_AT_CLASS % class_id
+        if self._label_vocabulary is None:
+          prob_key = keys.PROBABILITY_MEAN_AT_CLASS % class_id
+        else:
+          prob_key = (
+              keys.PROBABILITY_MEAN_AT_NAME % self._label_vocabulary[class_id])
         metric_ops[head_lib._summary_key(self._name, prob_key)] = (  # pylint:disable=protected-access
             head_lib._predictions_mean(  # pylint:disable=protected-access
                 predictions=class_probabilities,
                 weights=weights,
                 name=prob_key))
-        auc_key = keys.AUC_AT_CLASS % class_id
+        if self._label_vocabulary is None:
+          auc_key = keys.AUC_AT_CLASS % class_id
+        else:
+          auc_key = keys.AUC_AT_NAME % self._label_vocabulary[class_id]
         metric_ops[head_lib._summary_key(self._name, auc_key)] = (  # pylint:disable=protected-access
             head_lib._auc(  # pylint:disable=protected-access
                 labels=class_labels,
                 predictions=class_probabilities,
                 weights=weights,
                 name=auc_key))
-        auc_pr_key = keys.AUC_PR_AT_CLASS % class_id
+        if self._label_vocabulary is None:
+          auc_pr_key = keys.AUC_PR_AT_CLASS % class_id
+        else:
+          auc_pr_key = keys.AUC_PR_AT_NAME % self._label_vocabulary[class_id]
         metric_ops[head_lib._summary_key(self._name, auc_pr_key)] = (  # pylint:disable=protected-access
             head_lib._auc(  # pylint:disable=protected-access
                 labels=class_labels,
diff --git a/tensorflow/contrib/estimator/python/estimator/head_test.py b/tensorflow/contrib/estimator/python/estimator/head_test.py
index 7b884402d4650636bc9fe053994246aabb9c312d..2d367adb47080a630d1d2ef5ecfd4e8d5d0377d9 100644
--- a/tensorflow/contrib/estimator/python/estimator/head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/head_test.py
@@ -694,12 +694,14 @@ class MultiLabelHead(test.TestCase):
         # this assert tests that the algorithm remains consistent.
         keys.AUC: 0.3333,
         keys.AUC_PR: 0.7639,
-        keys.PROBABILITY_MEAN_AT_CLASS % 0: np.sum(_sigmoid(logits[:, 0])) / 2.,
-        keys.AUC_AT_CLASS % 0: 0.,
-        keys.AUC_PR_AT_CLASS % 0: 1.,
-        keys.PROBABILITY_MEAN_AT_CLASS % 1: np.sum(_sigmoid(logits[:, 1])) / 2.,
-        keys.AUC_AT_CLASS % 1: 1.,
-        keys.AUC_PR_AT_CLASS % 1: 1.,
+        keys.PROBABILITY_MEAN_AT_NAME % 'a':
+            np.sum(_sigmoid(logits[:, 0])) / 2.,
+        keys.AUC_AT_NAME % 'a': 0.,
+        keys.AUC_PR_AT_NAME % 'a': 1.,
+        keys.PROBABILITY_MEAN_AT_NAME % 'b':
+            np.sum(_sigmoid(logits[:, 1])) / 2.,
+        keys.AUC_AT_NAME % 'b': 1.,
+        keys.AUC_PR_AT_NAME % 'b': 1.,
     }
 
     self._test_eval(
diff --git a/tensorflow/contrib/estimator/python/estimator/hooks.py b/tensorflow/contrib/estimator/python/estimator/hooks.py
index ddd6aa442f82bad2d4714dbcdc85b20b34773068..caadafdfa6972c141d32a705e62a98d220cace41 100644
--- a/tensorflow/contrib/estimator/python/estimator/hooks.py
+++ b/tensorflow/contrib/estimator/python/estimator/hooks.py
@@ -189,7 +189,7 @@ class InMemoryEvaluatorHook(training.SessionRunHook):
         init_fn=feed_variables, copy_from_scaffold=self._scaffold)
 
     with self._graph.as_default():
-      return self._estimator._evaluate_run(
+      self._estimator._evaluate_run(
           checkpoint_path=None,
           scaffold=scaffold,
           update_op=self._update_op,
diff --git a/tensorflow/contrib/estimator/python/estimator/hooks_test.py b/tensorflow/contrib/estimator/python/estimator/hooks_test.py
index 95ae971852ee6dffb6174fc243686721c30ef685..ee88d5ecf50aa15b2faa0f3e136c686b5b0ef62a 100644
--- a/tensorflow/contrib/estimator/python/estimator/hooks_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/hooks_test.py
@@ -102,6 +102,7 @@ class InMemoryEvaluatorHookTest(test.TestCase):
     self.assertTrue(os.path.isdir(estimator.eval_dir()))
     step_keyword_to_value = summary_step_keyword_to_value_mapping(
         estimator.eval_dir())
+
     # 4.5 = sum(range(10))/10
     # before training
     self.assertEqual(4.5, step_keyword_to_value[0]['mean_of_features'])
@@ -110,6 +111,7 @@ class InMemoryEvaluatorHookTest(test.TestCase):
     self.assertEqual(4.5, step_keyword_to_value[8]['mean_of_features'])
     # end
     self.assertEqual(4.5, step_keyword_to_value[10]['mean_of_features'])
+    self.assertEqual(set([0, 4, 8, 10]), set(step_keyword_to_value.keys()))
 
   def test_uses_latest_variable_value(self):
 
diff --git a/tensorflow/contrib/estimator/python/estimator/saved_model_estimator.py b/tensorflow/contrib/estimator/python/estimator/saved_model_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0082f7e550b069c072654e3c3fec8f917a84478
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/saved_model_estimator.py
@@ -0,0 +1,449 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class that creates an Estimator from a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.python.estimator import estimator as estimator_lib
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.export import export as export_lib
+from tensorflow.python.estimator.export import export_output
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import loader_impl
+from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.training import checkpoint_utils
+from tensorflow.python.training import monitored_session
+from tensorflow.python.training import training_util
+
+
+class SavedModelEstimator(estimator_lib.Estimator):
+  """Create an Estimator from a SavedModel.
+
+  Only SavedModels exported with
+  `tf.contrib.estimator.export_all_saved_models()` or
+  `tf.estimator.Estimator.export_savedmodel()` are supported for this class.
+
+  Example with `tf.estimator.DNNClassifier`:
+
+  **Step 1: Create and train DNNClassifier.**
+
+  ```python
+  feature1 = tf.feature_column.embedding_column(
+      tf.feature_column.categorical_column_with_vocabulary_list(
+          key='feature1', vocabulary_list=('green', 'yellow')), dimension=1)
+  feature2 = tf.feature_column.numeric_column(key='feature2', default_value=0.0)
+
+  classifier = tf.estimator.DNNClassifier(
+      hidden_units=[4,2], feature_columns=[feature1, feature2])
+
+  def input_fn():
+    features = {'feature1': tf.constant(['green', 'green', 'yellow']),
+                'feature2': tf.constant([3.5, 4.2, 6.1])}
+    label = tf.constant([1., 0., 0.])
+    return tf.data.Dataset.from_tensors((features, label)).repeat()
+
+  classifier.train(input_fn=input_fn, steps=10)
+  ```
+
+  **Step 2: Export classifier.**
+  First, build functions that specify the expected inputs.
+
+  ```python
+  # During train and evaluation, both the features and labels should be defined.
+  supervised_input_receiver_fn = (
+      tf.contrib.estimator.build_raw_supervised_input_receiver_fn(
+          {'feature1': tf.placeholder(dtype=tf.string, shape=[None]),
+           'feature2': tf.placeholder(dtype=tf.float32, shape=[None])},
+          tf.placeholder(dtype=tf.float32, shape=[None])))
+
+  # During predict mode, expect to receive a `tf.Example` proto, so a parsing
+  # function is used.
+  serving_input_receiver_fn = (
+      tf.estimator.export.build_parsing_serving_input_receiver_fn(
+          tf.feature_column.make_parse_example_spec([feature1, feature2])))
+  ```
+
+  Next, export the model as a SavedModel. A timestamped directory will be
+  created (for example `/tmp/export_all/1234567890`).
+
+  ```python
+  # Option 1: Save all modes (train, eval, predict)
+  export_dir = tf.contrib.estimator.export_all_saved_models(
+      classifier, '/tmp/export_all',
+      {tf.estimator.ModeKeys.TRAIN: supervised_input_receiver_fn,
+       tf.estimator.ModeKeys.EVAL: supervised_input_receiver_fn,
+       tf.estimator.ModeKeys.PREDICT: serving_input_receiver_fn})
+
+  # Option 2: Only export predict mode
+  export_dir = classifier.export_savedmodel(
+      '/tmp/export_predict', serving_input_receiver_fn)
+  ```
+
+  **Step 3: Create a SavedModelEstimator from the exported SavedModel.**
+
+  ```python
+  est = tf.contrib.estimator.SavedModelEstimator(export_dir)
+
+  # If all modes were exported, you can immediately evaluate and predict, or
+  # continue training. Otherwise only predict is available.
+  eval_results = est.evaluate(input_fn=input_fn, steps=1)
+  print(eval_results)
+
+  est.train(input_fn=input_fn, steps=20)
+
+  def predict_input_fn():
+    example = tf.train.Example()
+    example.features.feature['feature1'].bytes_list.value.extend(['yellow'])
+    example.features.feature['feature2'].float_list.value.extend([1.])
+    return {'inputs':tf.constant([example.SerializeToString()])}
+
+  predictions = est.predict(predict_input_fn)
+  print(next(predictions))
+  ```
+  """
+
+  def __init__(self, saved_model_dir, model_dir=None):
+    """Initialize a SavedModelEstimator.
+
+    The SavedModelEstimator loads its model function and variable values from
+    the graphs defined in the SavedModel. There is no option to pass in
+    `RunConfig` or `params` arguments, because the model function graph is
+    defined statically in the SavedModel.
+
+    Args:
+      saved_model_dir: Directory containing SavedModel protobuf and subfolders.
+      model_dir: Directory to save new checkpoints during training.
+
+    Raises:
+      NotImplementedError: If a DistributionStrategy is defined in the config.
+        Unless the SavedModelEstimator is subclassed, this shouldn't happen.
+    """
+    checkpoint = estimator_lib._get_saved_model_ckpt(saved_model_dir)  # pylint: disable=protected-access
+    vars_to_warm_start = [name for name, _ in
+                          checkpoint_utils.list_variables(checkpoint)]
+    warm_start_settings = estimator_lib.WarmStartSettings(
+        ckpt_to_initialize_from=checkpoint,
+        vars_to_warm_start=vars_to_warm_start)
+
+    super(SavedModelEstimator, self).__init__(
+        model_fn=self._model_fn_from_saved_model, model_dir=model_dir,
+        warm_start_from=warm_start_settings)
+    if self._distribution is not None:
+      raise NotImplementedError(
+          'SavedModelEstimator currently does not support '
+          'DistributionStrategy.')
+    self.saved_model_dir = saved_model_dir
+    self.saved_model_loader = loader_impl.SavedModelLoader(saved_model_dir)
+    self._available_modes = self._extract_available_modes()
+
+  def _extract_available_modes(self):
+    """Return list of modes found in SavedModel."""
+    available_modes = []
+    logging.info('Checking available modes for SavedModelEstimator.')
+    for mode in [model_fn_lib.ModeKeys.TRAIN, model_fn_lib.ModeKeys.EVAL,
+                 model_fn_lib.ModeKeys.PREDICT]:
+      try:
+        self._get_meta_graph_def_for_mode(mode)
+      except RuntimeError:
+        logging.warning('%s mode not found in SavedModel.' % mode)
+        continue
+
+      if self._get_signature_def_for_mode(mode) is not None:
+        available_modes.append(mode)
+
+    logging.info('Available modes for Estimator: %s' % available_modes)
+    return available_modes
+
+  def _validate_mode(self, mode):
+    """Make sure that mode can be run using the SavedModel."""
+    if mode not in self._available_modes:
+      raise RuntimeError('%s mode is not available in the SavedModel. Use '
+                         'saved_model_cli to check that the Metagraph for this '
+                         'mode has been exported.' % mode)
+
+  def _get_meta_graph_def_for_mode(self, mode):
+    tags = model_fn_lib.EXPORT_TAG_MAP[mode]
+    return self.saved_model_loader.get_meta_graph_def_from_tags(tags)
+
+  def _get_signature_def_for_mode(self, mode):
+    meta_graph_def = self._get_meta_graph_def_for_mode(mode)
+    sig_def_key = (signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+                   if mode == model_fn_lib.ModeKeys.PREDICT else mode)
+    if sig_def_key not in meta_graph_def.signature_def:
+      logging.warning('Metagraph for mode %s was found, but SignatureDef with'
+                      ' key \"%s\" is missing.' % (mode, sig_def_key))
+      return None
+    return meta_graph_def.signature_def[sig_def_key]
+
+  def _create_and_assert_global_step(self, graph):
+    # Do nothing here. The global step variable will be created/loaded from the
+    # SavedModel. If a global step variable were created here, the result
+    # will be two duplicate global step variables, causing issues during
+    # the warm-start phase.
+    # Due to the global variable being created in the model function, this may
+    # cause issues when running DistributionStrategy. Thus, DistributionStrategy
+    # is not yet supported with SavedModelEstimator.
+    return None
+
+  def _model_fn_from_saved_model(self, features, labels, mode):
+    """Load a SavedModel graph and return an EstimatorSpec."""
+    # TODO(kathywu): Model function loads placeholders from the graph. Calling
+    # export_all_saved_models creates another placeholder for the inputs, on top
+    # of the original placeholders. There should be a way to avoid this.
+    self._validate_mode(mode)
+
+    g = ops.get_default_graph()
+    if  training_util.get_global_step(g) is not None:
+      raise RuntimeError(
+          'Graph must not contain a global step tensor before the SavedModel is'
+          ' loaded. Please make sure that the input function does not create a '
+          'global step.')
+
+    # Extract SignatureDef for information about the input and output tensors.
+    signature_def = self._get_signature_def_for_mode(mode)
+
+    # Generate input map for replacing the inputs in the SavedModel graph with
+    # the provided features and labels.
+    input_map = _generate_input_map(signature_def, features, labels)
+
+    # Create a list of the names of output tensors. When the graph is loaded,
+    # names of the output tensors may be remapped. This ensures that the correct
+    # tensors are returned in the EstimatorSpec.
+    output_tensor_names = [
+        value.name for value in six.itervalues(signature_def.outputs)]
+
+    # Load the graph. `output_tensors` contains output `Tensors` in the same
+    # same order as the `output_tensor_names` list.
+    tags = model_fn_lib.EXPORT_TAG_MAP[mode]
+    _, output_tensors = self.saved_model_loader.load_graph(
+        g, tags, input_map=input_map, return_elements=output_tensor_names)
+
+    # Create a scaffold from the MetaGraphDef that contains ops to initialize
+    # the graph. This should mirror the steps from _add_meta_graph_for_mode(),
+    # which creates a MetaGraphDef from the EstimatorSpec's scaffold.
+    scaffold = monitored_session.Scaffold(
+        local_init_op=loader_impl._get_main_op_tensor(  # pylint: disable=protected-access
+            self._get_meta_graph_def_for_mode(mode)))
+
+    # Ensure that a global step tensor has been created.
+    global_step_tensor = training_util.get_global_step(g)
+    training_util.assert_global_step(global_step_tensor)
+
+    # Extract values to return in the EstimatorSpec.
+    output_map = dict(zip(output_tensor_names, output_tensors))
+    outputs = {key: output_map[value.name]
+               for key, value in six.iteritems(signature_def.outputs)}
+
+    loss, predictions, metrics = _validate_and_extract_outputs(
+        mode, outputs, signature_def.method_name)
+
+    train_op = ops.get_collection(constants.TRAIN_OP_KEY)
+    if len(train_op) > 1:
+      raise RuntimeError('Multiple ops found in the train_op collection.')
+    train_op = None if not train_op else train_op[0]
+
+    _clear_saved_model_collections()
+    return model_fn_lib.EstimatorSpec(
+        scaffold=scaffold,
+        mode=mode,
+        loss=loss,
+        train_op=train_op,
+        predictions=predictions,
+        eval_metric_ops=metrics)
+
+
+def _clear_saved_model_collections():
+  """Clear collections that are expected empty when exporting a SavedModel.
+
+  The SavedModel builder uses these collections to track ops necessary to
+  restore the graph state. These collections are expected to be empty before
+  MetaGraphs are added to the builder.
+  """
+  del ops.get_collection_ref(constants.ASSETS_KEY)[:]
+  del ops.get_collection_ref(constants.LEGACY_INIT_OP_KEY)[:]
+  del ops.get_collection_ref(constants.MAIN_OP_KEY)[:]
+  del ops.get_collection_ref(constants.TRAIN_OP_KEY)[:]
+
+
+def _generate_input_map(signature_def, features, labels):
+  """Return dict mapping an input tensor name to a feature or label tensor.
+
+  Args:
+    signature_def: SignatureDef loaded from SavedModel
+    features: A `Tensor`, `SparseTensor`, or dict of string to `Tensor` or
+      `SparseTensor`, specifying the features to be passed to the model.
+    labels: A `Tensor`, `SparseTensor`, or dict of string to `Tensor` or
+      `SparseTensor`, specifying the labels to be passed to the model. May be
+      `None`.
+
+  Returns:
+    dict mapping string names of inputs to features or labels tensors
+
+  Raises:
+    ValueError: if SignatureDef inputs are not completely mapped by the input
+      features and labels.
+  """
+  # pylint: disable=protected-access
+  if not isinstance(features, dict):
+    features = {export_lib._SINGLE_FEATURE_DEFAULT_NAME: features}
+  if labels is not None and not isinstance(labels, dict):
+    labels = {export_lib._SINGLE_LABEL_DEFAULT_NAME: labels}
+  # pylint: enable=protected-access
+
+  inputs = signature_def.inputs
+  input_map = {}
+  for key, tensor_info in six.iteritems(inputs):
+    input_name = tensor_info.name
+    if ':' in input_name:
+      input_name = input_name[:input_name.find(':')]
+
+    # When tensors are used as control inputs for operations, their names are
+    # prepended with a '^' character in the GraphDef. To handle possible control
+    # flow edge cases, control input names must be included in the input map.
+    control_dependency_name = '^' + input_name
+
+    if key in features:
+      _check_same_dtype_and_shape(features[key], tensor_info, key)
+      input_map[input_name] = input_map[control_dependency_name] = features[key]
+    elif labels is not None and key in labels:
+      _check_same_dtype_and_shape(labels[key], tensor_info, key)
+      input_map[input_name] = input_map[control_dependency_name] = labels[key]
+    else:
+      raise ValueError(
+          'Key \"%s\" not found in features or labels passed in to the model '
+          'function. All required keys: %s' % (key, inputs.keys()))
+
+  return input_map
+
+
+def _check_same_dtype_and_shape(tensor, tensor_info, name):
+  """Validate that tensor has the same properties as the TensorInfo proto.
+
+  Args:
+    tensor: a `Tensor` object.
+    tensor_info: a `TensorInfo` proto.
+    name: Name of the input (to identify Tensor if an error is raised).
+
+  Raises:
+    ValueError: If the tensor shape or dtype don't match the TensorInfo
+  """
+  dtype_error = (tensor.dtype != dtypes.DType(tensor_info.dtype))
+  shape_error = not tensor.shape.is_compatible_with(tensor_info.tensor_shape)
+
+  if dtype_error or shape_error:
+    msg = 'Tensor shape and/or dtype validation failed for input %s:' % name
+    if dtype_error:
+      msg += ('\n\tExpected dtype: %s, Got: %s'
+              % (dtypes.DType(tensor_info.dtype), tensor.dtype))
+    if shape_error:
+      msg += ('\n\tExpected shape: %s, Got: %s'
+              % (tensor_shape.TensorShape(tensor_info.tensor_shape),
+                 tensor.shape))
+
+    raise ValueError(msg)
+
+
+def _extract_eval_metrics(output_dict):
+  """Return a eval metric dict extracted from the output_dict.
+
+  Eval metrics consist of a value tensor and an update op. Both must be in the
+  passed-in tensor dictionary for an eval metric to be added to the returned
+  dictionary.
+
+  Args:
+    output_dict: a dict that maps strings to tensors.
+
+  Returns:
+    dict mapping strings to (value, update_op) tuples.
+  """
+  # pylint: disable=protected-access
+  metric_ops = {}
+  separator_char = export_output._SupervisedOutput._SEPARATOR_CHAR
+
+  for key, tensor in six.iteritems(output_dict):
+    split_key = key.split(separator_char)
+
+    # The metric name may contain the separator character, so recreate its name.
+    metric_name = separator_char.join(split_key[:-1])
+
+    if split_key[0] == export_output._SupervisedOutput.METRICS_NAME:
+      # If the key ends with the value suffix, and there is a corresponding
+      # key ending with the update_op suffix, then add tensors to metrics dict.
+      if split_key[-1] == export_output._SupervisedOutput.METRIC_VALUE_SUFFIX:
+        update_op = ''.join(
+            [metric_name, separator_char,
+             export_output._SupervisedOutput.METRIC_UPDATE_SUFFIX])
+        if update_op in output_dict:
+          update_op_tensor = output_dict[update_op]
+          metric_ops[metric_name] = (tensor, update_op_tensor)
+
+  # pylint: enable=protected-access
+  return metric_ops
+
+
+def _validate_and_extract_outputs(mode, output_dict, method_name):
+  """Extract values from SignatureDef output dictionary.
+
+  Args:
+    mode: One of the modes enumerated in `tf.estimator.ModeKeys`.
+    output_dict: dict of string SignatureDef keys to `Tensor`.
+    method_name: Method name of the SignatureDef as a string.
+
+  Returns:
+    Tuple of (
+      loss: `Tensor` object,
+      predictions: dictionary mapping string keys to `Tensor` objects,
+      metrics: dictionary mapping string keys to a tuple of two `Tensor` objects
+    )
+
+  Raises:
+    RuntimeError: raised if SignatureDef has an invalid method name for the mode
+  """
+  # pylint: disable=protected-access
+  loss, predictions, metrics = None, None, None
+
+  if mode == model_fn_lib.ModeKeys.PREDICT:
+    predictions = output_dict
+  else:
+    # Validate that the SignatureDef's method name matches the expected name for
+    # the given mode.
+    expected_method_name = signature_constants.SUPERVISED_TRAIN_METHOD_NAME
+    if mode == model_fn_lib.ModeKeys.EVAL:
+      expected_method_name = signature_constants.SUPERVISED_EVAL_METHOD_NAME
+    if method_name != expected_method_name:
+      raise RuntimeError(
+          'Invalid SignatureDef method name for mode %s.\n\tExpected: %s\n\t'
+          'Got: %s\nPlease ensure that the SavedModel was exported with '
+          '`tf.contrib.estimator.export_all_saved_models()`.' %
+          (mode, expected_method_name, method_name))
+
+    # Extract loss, metrics and predictions from the output dict.
+    loss = output_dict[export_output._SupervisedOutput.LOSS_NAME]
+    metrics = _extract_eval_metrics(output_dict)
+    predictions = {
+        key: value for key, value in six.iteritems(output_dict)
+        if key.split(export_output._SupervisedOutput._SEPARATOR_CHAR)[0] == (
+            export_output._SupervisedOutput.PREDICTIONS_NAME)}
+
+  # pylint: enable=protected-access
+  return loss, predictions, metrics
diff --git a/tensorflow/contrib/estimator/python/estimator/saved_model_estimator_test.py b/tensorflow/contrib/estimator/python/estimator/saved_model_estimator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..718da1367ce69285f37269c5631fa0be2b050c97
--- /dev/null
+++ b/tensorflow/contrib/estimator/python/estimator/saved_model_estimator_test.py
@@ -0,0 +1,369 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for SavedModelEstimator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import shutil
+import tempfile
+
+from tensorflow.contrib.estimator.python.estimator import export as contrib_export
+from tensorflow.contrib.estimator.python.estimator import saved_model_estimator
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator.export import export
+from tensorflow.python.estimator.export import export_output
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import metrics as metrics_lib
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import monitored_session
+from tensorflow.python.training import training
+
+
+def dummy_input_fn():
+  return dataset_ops.Dataset.from_tensors((
+      {'x': constant_op.constant([[1], [-2]], dtype=dtypes.int64)},
+      constant_op.constant([[4], [-3]], dtype=dtypes.float32))).repeat()
+
+
+def dummy_input_fn_features_only():
+  return dataset_ops.Dataset.from_tensors(
+      {'x': constant_op.constant([[5], [6]], dtype=dtypes.int64)}).repeat()
+
+
+def dummy_supervised_receiver_fn():
+  feature_spec = {
+      'x': array_ops.placeholder(
+          dtype=dtypes.int64, shape=(2, 1), name='feature_x'),
+      }
+  label_spec = array_ops.placeholder(
+      dtype=dtypes.float32, shape=[2, 1], name='truth')
+  return export.build_raw_supervised_input_receiver_fn(
+      feature_spec, label_spec)
+
+
+def dummy_serving_receiver_fn():
+  feature_spec = {'x': array_ops.placeholder(
+      dtype=dtypes.int64, shape=(2, 1), name='feature_x'),}
+  return export.build_raw_serving_input_receiver_fn(feature_spec)
+
+
+def model_fn_diff_modes(features, labels, mode):
+  _, _ = features, labels
+  v = variables.Variable(21, name='some_var')
+  train_op = None
+  loss = constant_op.constant(104)
+  if mode == model_fn_lib.ModeKeys.TRAIN:
+    loss = constant_op.constant(105)
+    predictions = constant_op.constant([501])
+    train_op = control_flow_ops.group(
+        state_ops.assign_add(training.get_global_step(), 1),
+        state_ops.assign_add(v, 3))
+  elif mode == model_fn_lib.ModeKeys.EVAL:
+    loss = constant_op.constant(106)
+    predictions = constant_op.constant([502])
+  else:
+    loss = constant_op.constant(107)
+    predictions = constant_op.constant([503])
+  return model_fn_lib.EstimatorSpec(
+      mode,
+      loss=loss,
+      train_op=train_op,
+      eval_metric_ops={
+          'abs_err': metrics_lib.mean_absolute_error(
+              constant_op.constant(0), predictions)},
+      predictions=predictions)
+
+
+class SavedModelEstimatorTest(test.TestCase):
+
+  def setUp(self):
+    self.tmpdirs = []
+
+  def tearDown(self):
+    for tmpdir in self.tmpdirs:
+      # gfile.DeleteRecursively fails in the windows cmake test, so use shutil.
+      shutil.rmtree(tmpdir, ignore_errors=True)
+    self.tmpdirs = []
+
+  def _get_tmp_dir(self):
+    tmpdir = tempfile.mkdtemp()
+    self.tmpdirs.append(tmpdir)
+    return tmpdir
+
+  def _export_estimator(self, train=True, evaluate=True, predict=True,
+                        model_fn=model_fn_diff_modes):
+    est = estimator.Estimator(model_fn, self._get_tmp_dir())
+    est.train(input_fn=dummy_input_fn, steps=10)
+
+    input_receiver_fn_map = {}
+    if train:
+      input_receiver_fn_map[model_fn_lib.ModeKeys.TRAIN] = (
+          dummy_supervised_receiver_fn())
+    if evaluate:
+      input_receiver_fn_map[model_fn_lib.ModeKeys.EVAL] = (
+          dummy_supervised_receiver_fn())
+    if predict:
+      input_receiver_fn_map[model_fn_lib.ModeKeys.PREDICT] = (
+          dummy_serving_receiver_fn())
+
+    export_base_path = self._get_tmp_dir()
+    export_dir = contrib_export.export_all_saved_models(
+        est, export_base_path, input_receiver_fn_map)
+    return export_dir
+
+  def test_load_all_modes(self):
+    sme = saved_model_estimator.SavedModelEstimator(
+        self._export_estimator(), self._get_tmp_dir())
+    sme.train(input_fn=dummy_input_fn, steps=1)
+    sme.train(input_fn=dummy_input_fn, steps=2)
+    self.assertEqual(13, sme.get_variable_value('global_step'))
+    self.assertEqual(60, sme.get_variable_value('some_var'))
+
+    eval_results = sme.evaluate(dummy_input_fn, steps=5)
+
+    self.assertEqual(13, eval_results['global_step'])
+    self.assertEqual(106, eval_results['loss'])
+    self.assertEqual(502, eval_results['metrics/abs_err'])
+
+    predictions = next(sme.predict(dummy_input_fn_features_only))
+    self.assertDictEqual({'output': 503}, predictions)
+
+  def test_load_all_modes_no_train(self):
+    """Ensure that all functions can be used without requiring a ckpt."""
+    sme = saved_model_estimator.SavedModelEstimator(
+        self._export_estimator(), self._get_tmp_dir())
+    eval_results = sme.evaluate(dummy_input_fn, steps=5)
+    self.assertEqual(10, eval_results['global_step'])
+    self.assertEqual(106, eval_results['loss'])
+    self.assertEqual(502, eval_results['metrics/abs_err'])
+
+    predictions = next(sme.predict(dummy_input_fn_features_only))
+    self.assertDictEqual({'output': 503}, predictions)
+
+  def test_partial_exported_estimator(self):
+    sme1 = saved_model_estimator.SavedModelEstimator(
+        self._export_estimator(train=False, predict=False), self._get_tmp_dir())
+    sme1.evaluate(dummy_input_fn, steps=5)
+    with self.assertRaisesRegexp(RuntimeError, 'train mode is not available'):
+      sme1.train(input_fn=dummy_input_fn, steps=1)
+    with self.assertRaisesRegexp(RuntimeError, 'infer mode is not available'):
+      next(sme1.predict(dummy_input_fn_features_only))
+
+    sme2 = saved_model_estimator.SavedModelEstimator(
+        self._export_estimator(evaluate=False), self._get_tmp_dir())
+    sme2.train(input_fn=dummy_input_fn, steps=1)
+    next(sme2.predict(dummy_input_fn_features_only))
+    with self.assertRaisesRegexp(RuntimeError, 'eval mode is not available'):
+      sme2.evaluate(dummy_input_fn, steps=5)
+
+  def test_with_incorrect_input(self):
+    sme = saved_model_estimator.SavedModelEstimator(
+        self._export_estimator(), self._get_tmp_dir())
+
+    def bad_shape_input_fn():
+      return dataset_ops.Dataset.from_tensors((
+          {'x': constant_op.constant([1, 2], dtype=dtypes.int64)},
+          constant_op.constant([1, 2], dtype=dtypes.float32)))
+
+    with self.assertRaisesRegexp(ValueError, 'Expected shape'):
+      sme.train(bad_shape_input_fn, steps=1)
+
+    def bad_dtype_input_fn():
+      return dataset_ops.Dataset.from_tensors((
+          {'x': constant_op.constant([[1], [1]], dtype=dtypes.int32)},
+          constant_op.constant([[1], [1]], dtype=dtypes.int64)))
+
+    with self.assertRaisesRegexp(ValueError, 'Expected dtype'):
+      sme.train(bad_dtype_input_fn, steps=1)
+
+  def test_input_fn_with_global_step(self):
+    sme = saved_model_estimator.SavedModelEstimator(
+        self._export_estimator(), self._get_tmp_dir())
+
+    def bad_input_fn():
+      training.get_or_create_global_step()
+      return dataset_ops.Dataset.from_tensors((
+          {'x': constant_op.constant([[1], [1]], dtype=dtypes.int64)},
+          constant_op.constant([[1], [1]], dtype=dtypes.float32)))
+
+    with self.assertRaisesRegexp(RuntimeError,
+                                 'Graph must not contain a global step tensor'):
+      sme.train(bad_input_fn, steps=1)
+
+  def test_re_export_saved_model_serving_only(self):
+    sme = saved_model_estimator.SavedModelEstimator(
+        self._export_estimator(), self._get_tmp_dir())
+    sme.train(dummy_input_fn, steps=3)
+    self.assertEqual(13, sme.get_variable_value('global_step'))
+    self.assertEqual(60, sme.get_variable_value('some_var'))
+
+    predictions = next(sme.predict(dummy_input_fn_features_only))
+    self.assertDictEqual({'output': 503}, predictions)
+
+    # Export SavedModel, and test that the variable and prediction values are
+    # the same.
+    sme_export_dir = sme.export_savedmodel(
+        self._get_tmp_dir(), dummy_serving_receiver_fn())
+
+    sme2 = saved_model_estimator.SavedModelEstimator(
+        sme_export_dir, self._get_tmp_dir())
+    self.assertEqual(60, sme.get_variable_value('some_var'))
+    self.assertEqual(13, sme.get_variable_value('global_step'))
+
+    predictions = next(sme2.predict(dummy_input_fn_features_only))
+    self.assertDictEqual({'output': 503}, predictions)
+
+  def test_re_export_saved_model(self):
+    sme = saved_model_estimator.SavedModelEstimator(
+        self._export_estimator(), self._get_tmp_dir())
+    self.assertDictEqual(
+        {'loss': 106, 'metrics/abs_err': 502, 'global_step': 10},
+        sme.evaluate(dummy_input_fn, steps=1))
+
+    sme.train(dummy_input_fn, steps=3)
+    self.assertDictEqual(
+        {'loss': 106, 'metrics/abs_err': 502, 'global_step': 13},
+        sme.evaluate(dummy_input_fn, steps=1))
+    self.assertEqual(60, sme.get_variable_value('some_var'))
+
+    predictions = next(sme.predict(dummy_input_fn_features_only))
+    self.assertDictEqual({'output': 503}, predictions)
+
+    # Export SavedModel for all modes
+    input_receiver_fn_map = {
+        model_fn_lib.ModeKeys.TRAIN: dummy_supervised_receiver_fn(),
+        model_fn_lib.ModeKeys.EVAL: dummy_supervised_receiver_fn(),
+        model_fn_lib.ModeKeys.PREDICT: dummy_serving_receiver_fn()}
+    sme_export_dir = contrib_export.export_all_saved_models(
+        sme, self._get_tmp_dir(), input_receiver_fn_map)
+
+    sme2 = saved_model_estimator.SavedModelEstimator(
+        sme_export_dir, self._get_tmp_dir())
+    self.assertDictEqual(
+        {'loss': 106, 'metrics/abs_err': 502, 'global_step': 13},
+        sme.evaluate(dummy_input_fn, steps=1))
+    self.assertEqual(60, sme.get_variable_value('some_var'))
+
+    sme.train(dummy_input_fn, steps=7)
+    self.assertEqual(20, sme.get_variable_value('global_step'))
+
+    predictions = next(sme2.predict(dummy_input_fn_features_only))
+    self.assertDictEqual({'output': 503}, predictions)
+
+  def test_load_saved_model_from_serving_only(self):
+    def model_fn(features, labels, mode):
+      _, _ = features, labels
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          loss=constant_op.constant([103]),
+          train_op=state_ops.assign_add(training.get_global_step(), 1),
+          predictions=constant_op.constant([502]),
+          export_outputs={'test': export_output.ClassificationOutput(
+              constant_op.constant([[32.]]))})
+
+    est = estimator.Estimator(model_fn, self._get_tmp_dir())
+    est.train(input_fn=dummy_input_fn, steps=10)
+
+    def serving_input_receiver_fn():
+      return export.ServingInputReceiver(
+          {'test-features': constant_op.constant([[1], [1]])},
+          array_ops.placeholder(dtype=dtypes.string))
+
+    export_dir = est.export_savedmodel(
+        self._get_tmp_dir(), serving_input_receiver_fn)
+
+    sme = saved_model_estimator.SavedModelEstimator(
+        export_dir, self._get_tmp_dir())
+
+    def input_fn():
+      return {'inputs': constant_op.constant('someinputstr')}
+
+    prediction = next(sme.predict(input_fn))
+    self.assertDictEqual({'scores': 32}, prediction)
+
+  def test_with_local_init_op(self):
+    def model_fn(features, labels, mode):
+      _, _ = features, labels
+      v = variables.Variable(21, name='some_var')
+      scaffold = monitored_session.Scaffold(
+          local_init_op=state_ops.assign_add(v, -3).op
+      )
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          scaffold=scaffold,
+          train_op=state_ops.assign_add(training.get_global_step(), 1),
+          loss=array_ops.identity(v))
+    export_dir = self._export_estimator(predict=False, model_fn=model_fn)
+    sme = saved_model_estimator.SavedModelEstimator(
+        export_dir, self._get_tmp_dir())
+
+    eval_results1 = sme.evaluate(dummy_input_fn, steps=2)
+    self.assertEqual(15, eval_results1['loss'])
+
+    sme.train(dummy_input_fn, steps=1)
+    self.assertEqual(15, sme.get_variable_value('some_var'))
+
+    eval_results2 = sme.evaluate(dummy_input_fn, steps=5)
+    self.assertEqual(12, eval_results2['loss'])
+
+  def test_with_working_input_fn(self):
+    def model_fn(features, labels, mode):
+      loss = None
+      if labels is not None:
+        loss = labels[0][0] + labels[1][0]
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          loss=loss,
+          train_op=state_ops.assign_add(training.get_global_step(), 1),
+          predictions={'features_0': array_ops.identity([features['x'][0][0]]),
+                       'features_1': array_ops.identity([features['x'][1][0]])})
+
+    sme = saved_model_estimator.SavedModelEstimator(
+        self._export_estimator(model_fn=model_fn), self._get_tmp_dir())
+    eval_results = sme.evaluate(dummy_input_fn, steps=1)
+    self.assertEqual(1, eval_results['loss'])
+
+    predictions = next(sme.predict(dummy_input_fn_features_only))
+    self.assertDictEqual({'features_0': 5, 'features_1': 6}, predictions)
+
+  def test_control_dependency(self):
+    # Control dependencies are saved with "^" appended to the start of the input
+    # name. The input map must include control dependencies as well.
+    def model_fn(features, labels, mode):
+      _ = labels
+      with ops.control_dependencies([features['x']]):
+        loss = features['x'][1][0]
+      return model_fn_lib.EstimatorSpec(
+          mode,
+          loss=loss,
+          train_op=state_ops.assign_add(training.get_global_step(), 1))
+    sme = saved_model_estimator.SavedModelEstimator(
+        self._export_estimator(train=False, predict=False, model_fn=model_fn),
+        self._get_tmp_dir())
+    sme.evaluate(dummy_input_fn, steps=1)  # Should run without error
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/framework/python/ops/variables.py b/tensorflow/contrib/framework/python/ops/variables.py
index e8e318001972934c7d2154bc14744823a3ba09f9..322d5c335e6a77c46c7ce5dd795e21a2d5a1f8f9 100644
--- a/tensorflow/contrib/framework/python/ops/variables.py
+++ b/tensorflow/contrib/framework/python/ops/variables.py
@@ -34,6 +34,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import saver as tf_saver
@@ -199,10 +200,20 @@ def global_variable(initial_value,
 
 
 @contrib_add_arg_scope
-def variable(name, shape=None, dtype=None, initializer=None,
-             regularizer=None, trainable=True, collections=None,
-             caching_device=None, device=None,
-             partitioner=None, custom_getter=None, use_resource=None):
+def variable(name,
+             shape=None,
+             dtype=None,
+             initializer=None,
+             regularizer=None,
+             trainable=True,
+             collections=None,
+             caching_device=None,
+             device=None,
+             partitioner=None,
+             custom_getter=None,
+             use_resource=None,
+             synchronization=variables.VariableSynchronization.AUTO,
+             aggregation=variables.VariableAggregation.NONE):
   """Gets an existing variable with these parameters or creates a new one.
 
   Args:
@@ -228,6 +239,15 @@ def variable(name, shape=None, dtype=None, initializer=None,
     custom_getter: Callable that allows overwriting the internal
       get_variable method and has to have the same signature.
     use_resource: If `True` use a ResourceVariable instead of a Variable.
+    synchronization: Indicates when a distributed a variable will be
+      aggregated. Accepted values are constants defined in the class
+      @{tf.VariableSynchronization}. By default the synchronization is set to
+      `AUTO` and the current `DistributionStrategy` chooses
+      when to synchronize. If `synchronization` is set to `ON_READ`,
+      `trainable` must not be set to `True`.
+    aggregation: Indicates how a distributed variable will be aggregated.
+      Accepted values are constants defined in the class
+      @{tf.VariableAggregation}.
 
   Returns:
     The created or existing variable.
@@ -242,21 +262,36 @@ def variable(name, shape=None, dtype=None, initializer=None,
     getter = functools.partial(custom_getter,
                                reuse=variable_scope.get_variable_scope().reuse)
   with ops.device(device or ''):
-    return getter(name, shape=shape, dtype=dtype,
-                  initializer=initializer,
-                  regularizer=regularizer,
-                  trainable=trainable,
-                  collections=collections,
-                  caching_device=caching_device,
-                  partitioner=partitioner,
-                  use_resource=use_resource)
+    return getter(
+        name,
+        shape=shape,
+        dtype=dtype,
+        initializer=initializer,
+        regularizer=regularizer,
+        trainable=trainable,
+        collections=collections,
+        caching_device=caching_device,
+        partitioner=partitioner,
+        use_resource=use_resource,
+        synchronization=synchronization,
+        aggregation=aggregation)
 
 
 @contrib_add_arg_scope
-def model_variable(name, shape=None, dtype=dtypes.float32, initializer=None,
-                   regularizer=None, trainable=True, collections=None,
-                   caching_device=None, device=None, partitioner=None,
-                   custom_getter=None, use_resource=None):
+def model_variable(name,
+                   shape=None,
+                   dtype=dtypes.float32,
+                   initializer=None,
+                   regularizer=None,
+                   trainable=True,
+                   collections=None,
+                   caching_device=None,
+                   device=None,
+                   partitioner=None,
+                   custom_getter=None,
+                   use_resource=None,
+                   synchronization=variables.VariableSynchronization.AUTO,
+                   aggregation=variables.VariableAggregation.NONE):
   """Gets an existing model variable with these parameters or creates a new one.
 
   Args:
@@ -283,18 +318,36 @@ def model_variable(name, shape=None, dtype=dtypes.float32, initializer=None,
     custom_getter: Callable that allows overwriting the internal
       get_variable method and has to have the same signature.
     use_resource: If `True` use a ResourceVariable instead of a Variable.
+    synchronization: Indicates when a distributed a variable will be
+      aggregated. Accepted values are constants defined in the class
+      @{tf.VariableSynchronization}. By default the synchronization is set to
+      `AUTO` and the current `DistributionStrategy` chooses
+      when to synchronize. If `synchronization` is set to `ON_READ`,
+      `trainable` must not be set to `True`.
+    aggregation: Indicates how a distributed variable will be aggregated.
+      Accepted values are constants defined in the class
+      @{tf.VariableAggregation}.
 
   Returns:
     The created or existing variable.
   """
   collections = list(collections or [])
   collections += [ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.MODEL_VARIABLES]
-  var = variable(name, shape=shape, dtype=dtype,
-                 initializer=initializer, regularizer=regularizer,
-                 trainable=trainable, collections=collections,
-                 caching_device=caching_device, device=device,
-                 partitioner=partitioner, custom_getter=custom_getter,
-                 use_resource=use_resource)
+  var = variable(
+      name,
+      shape=shape,
+      dtype=dtype,
+      initializer=initializer,
+      regularizer=regularizer,
+      trainable=trainable,
+      collections=collections,
+      caching_device=caching_device,
+      device=device,
+      partitioner=partitioner,
+      custom_getter=custom_getter,
+      use_resource=use_resource,
+      synchronization=synchronization,
+      aggregation=aggregation)
   return var
 
 
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
index 4554a3d89a2eb66a572f160cf7a7af30caf03fb2..0ccb4583ab653bc2ef6c5c810c902a9332e82df9 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv2d_bias_activation_op.cc
@@ -443,6 +443,8 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
                                          : dnn::DataLayout::kBatchDepthYX;
   constexpr auto filter_layout = is_int8x4 ? dnn::FilterLayout::kOutputInputYX4
                                            : dnn::FilterLayout::kOutputInputYX;
+  constexpr auto compute_data_format =
+      is_int8x4 ? FORMAT_NCHW_VECT_C : FORMAT_NCHW;
 
   dnn::BatchDescriptor conv_input_desc;
   conv_input_desc.set_count(batch_size)
@@ -529,6 +531,7 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
       batch_size,
       conv_input_depth,
       {{conv_input_rows, conv_input_cols}},
+      compute_data_format,
       output_depth,
       {{filter_rows, filter_cols}},
       // TODO(yangzihao): Add support for arbitrary dilations for fused conv.
diff --git a/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
index ba52697679dafc239b1dac5562573b3589877a8c..b9c131a2e91469c52931080d8a5af90247bd16f0 100644
--- a/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
+++ b/tensorflow/contrib/fused_conv/kernels/fused_conv_ops_gpu.h
@@ -29,13 +29,13 @@ namespace tensorflow {
 class FusedConvParameters : public ConvParameters {
  public:
   FusedConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
-                      int64 out_depths, const SpatialArray& filter,
-                      const SpatialArray& dilation, const SpatialArray& stride,
-                      const SpatialArray& padding, DataType dtype,
-                      int device_id, bool has_side_input,
+                      TensorFormat data_format, int64 out_depths,
+                      const SpatialArray& filter, const SpatialArray& dilation,
+                      const SpatialArray& stride, const SpatialArray& padding,
+                      DataType dtype, int device_id, bool has_side_input,
                       ActivationMode activation_mode)
-      : ConvParameters(batch, in_depths, in, out_depths, filter, dilation,
-                       stride, padding, dtype, device_id),
+      : ConvParameters(batch, in_depths, in, data_format, out_depths, filter,
+                       dilation, stride, padding, dtype, device_id),
         activation_mode_(activation_mode),
         has_side_input_(has_side_input) {
     hash_code_ = Hash64Combine(hash_code_, has_side_input);
diff --git a/tensorflow/contrib/gan/BUILD b/tensorflow/contrib/gan/BUILD
index c8c2af49d470ff489ebffd4731a3e47a430c5ee6..7e6cb724853b3a0aaaefc1137373063fb78f5549 100644
--- a/tensorflow/contrib/gan/BUILD
+++ b/tensorflow/contrib/gan/BUILD
@@ -57,6 +57,7 @@ py_library(
 py_test(
     name = "train_test",
     srcs = ["python/train_test.py"],
+    shard_count = 50,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -80,6 +81,7 @@ py_test(
         "//tensorflow/python:variables",
         "//tensorflow/python/ops/distributions",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -255,12 +257,15 @@ py_library(
 py_test(
     name = "random_tensor_pool_test",
     srcs = ["python/features/python/random_tensor_pool_test.py"],
+    shard_count = 6,
     srcs_version = "PY2AND3",
     deps = [
         ":random_tensor_pool",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py
index 9e4ec59e7098443efc53506a4ba159e84b5c1618..ca2d724b49db25191b5744e10b48c66b6bdeb120 100644
--- a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py
+++ b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_impl.py
@@ -36,16 +36,15 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.util import nest
 
 __all__ = [
     'tensor_pool',
 ]
 
 
-def _to_tuple(x):
-  if isinstance(x, (list, tuple)):
-    return tuple(x)
-  return (x,)
+def _to_list(x):
+  return [x] if isinstance(x, ops.Tensor) else list(x)
 
 
 def tensor_pool(input_values,
@@ -63,8 +62,8 @@ def tensor_pool(input_values,
   `pool_size` = 0 or `pooling_probability` = 0.
 
   Args:
-    input_values: A `Tensor`, or a list or tuple of `Tensor`s from which to read
-      values to be pooled.
+    input_values: An arbitrarily nested structure of `tf.Tensors`, from which to
+      read values to be pooled.
     pool_size: An integer specifying the maximum size of the pool. Defaults to
       50.
     pooling_probability: A float `Tensor` specifying the probability of getting
@@ -72,9 +71,10 @@ def tensor_pool(input_values,
     name: A string prefix for the name scope for all tensorflow ops.
 
   Returns:
-    A `Tensor`, or a list or tuple of `Tensor`s (according to the type ofx
-    `input_values`) which is with given probability either the `input_values` or
-    a randomly chosen sample that was previously inserted in the pool.
+    A nested structure of `Tensor` objects with the same structure as
+    `input_values`. With the given probability, the Tensor values are either the
+    same as in `input_values` or a randomly chosen sample that was previously
+    inserted in the pool.
 
   Raises:
     ValueError: If `pool_size` is negative.
@@ -86,11 +86,10 @@ def tensor_pool(input_values,
     return input_values
 
   original_input_values = input_values
-  input_values = _to_tuple(input_values)
+  input_values = nest.flatten(input_values)
 
-  with ops.name_scope(
-      '{}_pool_queue'.format(name),
-      values=input_values + (pooling_probability,)):
+  with ops.name_scope('{}_pool_queue'.format(name),
+                      values=input_values + [pooling_probability]):
     pool_queue = data_flow_ops.RandomShuffleQueue(
         capacity=pool_size,
         min_after_dequeue=0,
@@ -112,10 +111,10 @@ def tensor_pool(input_values,
     def _get_input_value_pooled():
       enqueue_op = pool_queue.enqueue(input_values)
       with ops.control_dependencies([enqueue_op]):
-        return tuple(array_ops.identity(v) for v in input_values)
+        return [array_ops.identity(v) for v in input_values]
 
     def _get_random_pool_value_and_enqueue_input():
-      dequeue_values = _to_tuple(pool_queue.dequeue())
+      dequeue_values = _to_list(pool_queue.dequeue())
       with ops.control_dependencies(dequeue_values):
         enqueue_op = pool_queue.enqueue(input_values)
         with ops.control_dependencies([enqueue_op]):
@@ -124,7 +123,7 @@ def tensor_pool(input_values,
           return control_flow_ops.cond(prob, lambda: dequeue_values,
                                        lambda: input_values)
 
-    output_values = _to_tuple(control_flow_ops.cond(
+    output_values = _to_list(control_flow_ops.cond(
         pool_queue.size() < pool_size, _get_input_value_pooled,
         _get_random_pool_value_and_enqueue_input))
 
@@ -132,8 +131,4 @@ def tensor_pool(input_values,
     for input_value, output_value in zip(input_values, output_values):
       output_value.set_shape(input_value.shape)
 
-  if isinstance(original_input_values, list):
-    return list(output_values)
-  elif isinstance(original_input_values, tuple):
-    return output_values
-  return output_values[0]
+  return nest.pack_sequence_as(original_input_values, output_values)
diff --git a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py
index d8cf549cf71838178c9da01df462d41d81595fe5..08584dcd656e3e7a079a3fa36f44742b5eac1178 100644
--- a/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py
+++ b/tensorflow/contrib/gan/python/features/python/random_tensor_pool_test.py
@@ -21,7 +21,9 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.gan.python.features.python.random_tensor_pool_impl import tensor_pool
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -111,6 +113,23 @@ class TensorPoolTest(test.TestCase):
         self.assertEqual(len(outs), len(input_values))
         self.assertEqual(outs[1] - outs[0], 1)
 
+  def test_pool_preserves_shape(self):
+    t = constant_op.constant(1)
+    input_values = [[t, t, t], (t, t), t]
+    output_values = tensor_pool(input_values, pool_size=5)
+    print('stuff: ', output_values)
+    # Overall shape.
+    self.assertIsInstance(output_values, list)
+    self.assertEqual(3, len(output_values))
+    # Shape of first element.
+    self.assertIsInstance(output_values[0], list)
+    self.assertEqual(3, len(output_values[0]))
+    # Shape of second element.
+    self.assertIsInstance(output_values[1], tuple)
+    self.assertEqual(2, len(output_values[1]))
+    # Shape of third element.
+    self.assertIsInstance(output_values[2], ops.Tensor)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl.py b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
index 1ba3a641671c7f2a411a0c5f99228ca16eee1080..d3897483740faafa62befbaf873886139f1482d2 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
@@ -949,6 +949,11 @@ def cycle_consistency_loss(data_x,
   * loss = (loss_x2x + loss_y2y) / 2
   where `loss` is the final result.
 
+  For the L1-norm, we follow the original implementation:
+  https://github.com/junyanz/CycleGAN/blob/master/models/cycle_gan_model.lua
+  we use L1-norm of pixel-wise error normalized by data size such that
+  `cycle_loss_weight` can be specified independent of image size.
+
   See https://arxiv.org/abs/1703.10593 for more details.
 
   Args:
@@ -965,19 +970,12 @@ def cycle_consistency_loss(data_x,
     A scalar `Tensor` of cycle consistency loss.
   """
 
-  def _partial_cycle_consistency_loss(data, reconstructed_data):
-    # Following the original implementation
-    # https://github.com/junyanz/CycleGAN/blob/master/models/cycle_gan_model.lua
-    # use L1-norm of pixel-wise error normalized by data size so that
-    # `cycle_loss_weight` can be specified independent of image size.
-    return math_ops.reduce_mean(math_ops.abs(data - reconstructed_data))
-
   with ops.name_scope(
       scope,
       'cycle_consistency_loss',
       values=[data_x, reconstructed_data_x, data_y, reconstructed_data_y]):
-    loss_x2x = _partial_cycle_consistency_loss(data_x, reconstructed_data_x)
-    loss_y2y = _partial_cycle_consistency_loss(data_y, reconstructed_data_y)
+    loss_x2x = losses.absolute_difference(data_x, reconstructed_data_x)
+    loss_y2y = losses.absolute_difference(data_y, reconstructed_data_y)
     loss = (loss_x2x + loss_y2y) / 2.0
     if add_summaries:
       summary.scalar('cycle_consistency_loss_x2x', loss_x2x)
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index 49d932733306d1c9c7252f3cfc3d8bbe5f123bf6..df603d1f183ee65893d6933ed77c52f52e5181e9 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -514,33 +514,42 @@ def _tensor_pool_adjusted_model(model, tensor_pool_fn):
   Raises:
     ValueError: If tensor pool does not support the `model`.
   """
-  if tensor_pool_fn is None:
-    return model
-
-  pooled_generated_data, pooled_generator_inputs = tensor_pool_fn(
-      (model.generated_data, model.generator_inputs))
-
   if isinstance(model, namedtuples.GANModel):
+    pooled_generator_inputs, pooled_generated_data = tensor_pool_fn(
+        (model.generator_inputs, model.generated_data))
     with variable_scope.variable_scope(model.discriminator_scope, reuse=True):
       dis_gen_outputs = model.discriminator_fn(pooled_generated_data,
                                                pooled_generator_inputs)
-    return model._replace(discriminator_gen_outputs=dis_gen_outputs)
+    return model._replace(
+        generator_inputs=pooled_generator_inputs,
+        generated_data=pooled_generated_data,
+        discriminator_gen_outputs=dis_gen_outputs)
   elif isinstance(model, namedtuples.ACGANModel):
+    pooled_generator_inputs, pooled_generated_data = tensor_pool_fn(
+        (model.generator_inputs, model.generated_data))
     with variable_scope.variable_scope(model.discriminator_scope, reuse=True):
-      (dis_pooled_gen_outputs,
-       dis_pooled_gen_classification_logits) = model.discriminator_fn(
+      (pooled_discriminator_gen_outputs,
+       pooled_discriminator_gen_classification_logits) = model.discriminator_fn(
            pooled_generated_data, pooled_generator_inputs)
     return model._replace(
-        discriminator_gen_outputs=dis_pooled_gen_outputs,
+        generator_inputs=pooled_generator_inputs,
+        generated_data=pooled_generated_data,
+        discriminator_gen_outputs=pooled_discriminator_gen_outputs,
         discriminator_gen_classification_logits=
-        dis_pooled_gen_classification_logits)
+        pooled_discriminator_gen_classification_logits)
   elif isinstance(model, namedtuples.InfoGANModel):
+    pooled_generator_inputs, pooled_generated_data, pooled_structured_input = (
+        tensor_pool_fn((model.generator_inputs, model.generated_data,
+                        model.structured_generator_inputs)))
     with variable_scope.variable_scope(model.discriminator_scope, reuse=True):
-      (dis_pooled_gen_outputs,
+      (pooled_discriminator_gen_outputs,
        pooled_predicted_distributions) = model.discriminator_and_aux_fn(
            pooled_generated_data, pooled_generator_inputs)
     return model._replace(
-        discriminator_gen_outputs=dis_pooled_gen_outputs,
+        generator_inputs=pooled_generator_inputs,
+        generated_data=pooled_generated_data,
+        structured_generator_inputs=pooled_structured_input,
+        discriminator_gen_outputs=pooled_discriminator_gen_outputs,
         predicted_distributions=pooled_predicted_distributions)
   else:
     raise ValueError('Tensor pool does not support `model`: %s.' % type(model))
@@ -632,33 +641,38 @@ def gan_loss(
         'is provided, `model` must be an `ACGANModel`. Instead, was %s.' %
         type(model))
 
+  # Optionally create pooled model.
+  pooled_model = (_tensor_pool_adjusted_model(model, tensor_pool_fn) if
+                  tensor_pool_fn else model)
+
   # Create standard losses.
   gen_loss = generator_loss_fn(model, add_summaries=add_summaries)
-  dis_loss = discriminator_loss_fn(
-      _tensor_pool_adjusted_model(model, tensor_pool_fn),
-      add_summaries=add_summaries)
+  dis_loss = discriminator_loss_fn(pooled_model, add_summaries=add_summaries)
 
   # Add optional extra losses.
   if _use_aux_loss(gradient_penalty_weight):
     gp_loss = tfgan_losses.wasserstein_gradient_penalty(
-        model,
+        pooled_model,
         epsilon=gradient_penalty_epsilon,
         target=gradient_penalty_target,
         one_sided=gradient_penalty_one_sided,
         add_summaries=add_summaries)
     dis_loss += gradient_penalty_weight * gp_loss
   if _use_aux_loss(mutual_information_penalty_weight):
-    info_loss = tfgan_losses.mutual_information_penalty(
+    gen_info_loss = tfgan_losses.mutual_information_penalty(
         model, add_summaries=add_summaries)
-    dis_loss += mutual_information_penalty_weight * info_loss
-    gen_loss += mutual_information_penalty_weight * info_loss
+    dis_info_loss = (gen_info_loss if tensor_pool_fn is None else
+                     tfgan_losses.mutual_information_penalty(
+                         pooled_model, add_summaries=add_summaries))
+    gen_loss += mutual_information_penalty_weight * gen_info_loss
+    dis_loss += mutual_information_penalty_weight * dis_info_loss
   if _use_aux_loss(aux_cond_generator_weight):
     ac_gen_loss = tfgan_losses.acgan_generator_loss(
         model, add_summaries=add_summaries)
     gen_loss += aux_cond_generator_weight * ac_gen_loss
   if _use_aux_loss(aux_cond_discriminator_weight):
     ac_disc_loss = tfgan_losses.acgan_discriminator_loss(
-        model, add_summaries=add_summaries)
+        pooled_model, add_summaries=add_summaries)
     dis_loss += aux_cond_discriminator_weight * ac_disc_loss
   # Gathers auxiliary losses.
   if model.generator_scope:
diff --git a/tensorflow/contrib/gan/python/train_test.py b/tensorflow/contrib/gan/python/train_test.py
index 93a12af9441348d39c2707f4deb6c42a00498bd0..df8e0041a9e32409166db4bb089210cdd63d594f 100644
--- a/tensorflow/contrib/gan/python/train_test.py
+++ b/tensorflow/contrib/gan/python/train_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.contrib import layers
@@ -113,6 +114,12 @@ def stargan_generator_model(inputs, _):
   return variable_scope.get_variable('dummy_g', initializer=0.5) * inputs
 
 
+class StarGANGenerator(object):
+
+  def __call__(self, inputs, _):
+    return stargan_generator_model(inputs, _)
+
+
 def stargan_discriminator_model(inputs, num_domains):
   """Differentiable dummy discriminator for StarGAN."""
 
@@ -129,6 +136,12 @@ def stargan_discriminator_model(inputs, num_domains):
   return output_src, output_cls
 
 
+class StarGANDiscriminator(object):
+
+  def __call__(self, inputs, num_domains):
+    return stargan_discriminator_model(inputs, num_domains)
+
+
 def get_gan_model():
   # TODO(joelshor): Find a better way of creating a variable scope.
   with variable_scope.variable_scope('generator') as gen_scope:
@@ -271,63 +284,75 @@ def create_callable_cyclegan_model():
       data_y=array_ops.ones([1, 2]))
 
 
-def get_sync_optimizer():
-  return sync_replicas_optimizer.SyncReplicasOptimizer(
-      gradient_descent.GradientDescentOptimizer(learning_rate=1.0),
-      replicas_to_aggregate=1)
+def get_stargan_model():
+  """Similar to get_gan_model()."""
+  # TODO(joelshor): Find a better way of creating a variable scope.
+  with variable_scope.variable_scope('generator') as gen_scope:
+    pass
+  with variable_scope.variable_scope('discriminator') as dis_scope:
+    pass
+  return namedtuples.StarGANModel(
+      input_data=array_ops.ones([1, 2, 2, 3]),
+      input_data_domain_label=array_ops.ones([1, 2]),
+      generated_data=array_ops.ones([1, 2, 2, 3]),
+      generated_data_domain_target=array_ops.ones([1, 2]),
+      reconstructed_data=array_ops.ones([1, 2, 2, 3]),
+      discriminator_input_data_source_predication=array_ops.ones([1]),
+      discriminator_generated_data_source_predication=array_ops.ones([1]),
+      discriminator_input_data_domain_predication=array_ops.ones([1, 2]),
+      discriminator_generated_data_domain_predication=array_ops.ones([1, 2]),
+      generator_variables=None,
+      generator_scope=gen_scope,
+      generator_fn=stargan_generator_model,
+      discriminator_variables=None,
+      discriminator_scope=dis_scope,
+      discriminator_fn=stargan_discriminator_model)
 
 
-def get_tensor_pool_fn(pool_size):
+def get_callable_stargan_model():
+  model = get_stargan_model()
+  return model._replace(
+      generator_fn=StarGANGenerator(), discriminator_fn=StarGANDiscriminator())
 
-  def tensor_pool_fn_impl(input_values):
-    return random_tensor_pool.tensor_pool(input_values, pool_size=pool_size)
 
-  return tensor_pool_fn_impl
+def create_stargan_model():
+  return train.stargan_model(
+      stargan_generator_model, stargan_discriminator_model,
+      array_ops.ones([1, 2, 2, 3]), array_ops.ones([1, 2]))
 
 
-def get_tensor_pool_fn_for_infogan(pool_size):
+def create_callable_stargan_model():
+  return train.stargan_model(StarGANGenerator(), StarGANDiscriminator(),
+                             array_ops.ones([1, 2, 2, 3]),
+                             array_ops.ones([1, 2]))
 
-  def tensor_pool_fn_impl(input_values):
-    generated_data, generator_inputs = input_values
-    output_values = random_tensor_pool.tensor_pool(
-        [generated_data] + generator_inputs, pool_size=pool_size)
-    return output_values[0], output_values[1:]
 
-  return tensor_pool_fn_impl
+def get_sync_optimizer():
+  return sync_replicas_optimizer.SyncReplicasOptimizer(
+      gradient_descent.GradientDescentOptimizer(learning_rate=1.0),
+      replicas_to_aggregate=1)
 
 
-class GANModelTest(test.TestCase):
+class GANModelTest(test.TestCase, parameterized.TestCase):
   """Tests for `gan_model`."""
 
-  def _test_output_type_helper(self, create_fn, tuple_type):
-    self.assertTrue(isinstance(create_fn(), tuple_type))
-
-  def test_output_type_gan(self):
-    self._test_output_type_helper(get_gan_model, namedtuples.GANModel)
-
-  def test_output_type_callable_gan(self):
-    self._test_output_type_helper(get_callable_gan_model, namedtuples.GANModel)
-
-  def test_output_type_infogan(self):
-    self._test_output_type_helper(get_infogan_model, namedtuples.InfoGANModel)
-
-  def test_output_type_callable_infogan(self):
-    self._test_output_type_helper(get_callable_infogan_model,
-                                  namedtuples.InfoGANModel)
-
-  def test_output_type_acgan(self):
-    self._test_output_type_helper(get_acgan_model, namedtuples.ACGANModel)
-
-  def test_output_type_callable_acgan(self):
-    self._test_output_type_helper(get_callable_acgan_model,
-                                  namedtuples.ACGANModel)
-
-  def test_output_type_cyclegan(self):
-    self._test_output_type_helper(get_cyclegan_model, namedtuples.CycleGANModel)
-
-  def test_output_type_callable_cyclegan(self):
-    self._test_output_type_helper(get_callable_cyclegan_model,
-                                  namedtuples.CycleGANModel)
+  @parameterized.named_parameters(
+      ('gan', get_gan_model, namedtuples.GANModel),
+      ('callable_gan', get_callable_gan_model, namedtuples.GANModel),
+      ('infogan', get_infogan_model, namedtuples.InfoGANModel),
+      ('callable_infogan', get_callable_infogan_model,
+       namedtuples.InfoGANModel),
+      ('acgan', get_acgan_model, namedtuples.ACGANModel),
+      ('callable_acgan', get_callable_acgan_model, namedtuples.ACGANModel),
+      ('cyclegan', get_cyclegan_model, namedtuples.CycleGANModel),
+      ('callable_cyclegan', get_callable_cyclegan_model,
+       namedtuples.CycleGANModel),
+      ('stargan', get_stargan_model, namedtuples.StarGANModel),
+      ('callabel_stargan', get_callable_stargan_model, namedtuples.StarGANModel)
+  )
+  def test_output_type(self, create_fn, expected_tuple_type):
+    """Test that output type is as expected."""
+    self.assertIsInstance(create_fn(), expected_tuple_type)
 
   def test_no_shape_check(self):
 
@@ -357,7 +382,6 @@ class StarGANModelTest(test.TestCase):
 
   @staticmethod
   def create_input_and_label_tensor(batch_size, img_size, c_size, num_domains):
-
     input_tensor_list = []
     label_tensor_list = []
     for _ in range(num_domains):
@@ -369,7 +393,6 @@ class StarGANModelTest(test.TestCase):
     return input_tensor_list, label_tensor_list
 
   def test_generate_stargan_random_domain_target(self):
-
     batch_size = 8
     domain_numbers = 3
 
@@ -384,7 +407,6 @@ class StarGANModelTest(test.TestCase):
         self.assertEqual(1, np.max(target))
 
   def test_stargan_model_output_type(self):
-
     batch_size = 2
     img_size = 16
     c_size = 3
@@ -408,7 +430,6 @@ class StarGANModelTest(test.TestCase):
     self.assertTrue(callable(model.generator_fn))
 
   def test_stargan_model_generator_output(self):
-
     batch_size = 2
     img_size = 16
     c_size = 3
@@ -439,7 +460,6 @@ class StarGANModelTest(test.TestCase):
           reconstructed_data.shape)
 
   def test_stargan_model_discriminator_output(self):
-
     batch_size = 2
     img_size = 16
     c_size = 3
@@ -484,53 +504,55 @@ class StarGANModelTest(test.TestCase):
                             disc_gen_label.shape)
 
 
-class GANLossTest(test.TestCase):
+class GANLossTest(test.TestCase, parameterized.TestCase):
   """Tests for `gan_loss`."""
 
-  # Test output type.
-  def _test_output_type_helper(self, get_gan_model_fn):
+  @parameterized.named_parameters(
+      ('gan', get_gan_model),
+      ('callable_gan', get_callable_gan_model),
+      ('infogan', get_infogan_model),
+      ('callable_infogan', get_callable_infogan_model),
+      ('acgan', get_acgan_model),
+      ('callable_acgan', get_callable_acgan_model),
+  )
+  def test_output_type(self, get_gan_model_fn):
+    """Test output type."""
     loss = train.gan_loss(get_gan_model_fn(), add_summaries=True)
-    self.assertTrue(isinstance(loss, namedtuples.GANLoss))
-    self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
-
-  def test_output_type_gan(self):
-    self._test_output_type_helper(get_gan_model)
-
-  def test_output_type_callable_gan(self):
-    self._test_output_type_helper(get_callable_gan_model)
-
-  def test_output_type_infogan(self):
-    self._test_output_type_helper(get_infogan_model)
-
-  def test_output_type_callable_infogan(self):
-    self._test_output_type_helper(get_callable_infogan_model)
-
-  def test_output_type_acgan(self):
-    self._test_output_type_helper(get_acgan_model)
-
-  def test_output_type_callable_acgan(self):
-    self._test_output_type_helper(get_callable_acgan_model)
-
-  def test_output_type_cyclegan(self):
-    loss = train.cyclegan_loss(create_cyclegan_model(), add_summaries=True)
-    self.assertIsInstance(loss, namedtuples.CycleGANLoss)
+    self.assertIsInstance(loss, namedtuples.GANLoss)
     self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
 
-  def test_output_type_callable_cyclegan(self):
-    loss = train.cyclegan_loss(
-        create_callable_cyclegan_model(), add_summaries=True)
+  @parameterized.named_parameters(
+      ('cyclegan', create_cyclegan_model),
+      ('callable_cyclegan', create_callable_cyclegan_model),
+  )
+  def test_cyclegan_output_type(self, get_gan_model_fn):
+    loss = train.cyclegan_loss(get_gan_model_fn(), add_summaries=True)
     self.assertIsInstance(loss, namedtuples.CycleGANLoss)
     self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
 
-  # Test gradient penalty option.
-  def _test_grad_penalty_helper(self, create_gan_model_fn, one_sided=False):
+  @parameterized.named_parameters(
+      ('gan', create_gan_model, False),
+      ('gan_one_sided', create_gan_model, True),
+      ('callable_gan', create_callable_gan_model, False),
+      ('callable_gan_one_sided', create_callable_gan_model, True),
+      ('infogan', create_infogan_model, False),
+      ('infogan_one_sided', create_infogan_model, True),
+      ('callable_infogan', create_callable_infogan_model, False),
+      ('callable_infogan_one_sided', create_callable_infogan_model, True),
+      ('acgan', create_acgan_model, False),
+      ('acgan_one_sided', create_acgan_model, True),
+      ('callable_acgan', create_callable_acgan_model, False),
+      ('callable_acgan_one_sided', create_callable_acgan_model, True),
+  )
+  def test_grad_penalty(self, create_gan_model_fn, one_sided):
+    """Test gradient penalty option."""
     model = create_gan_model_fn()
     loss = train.gan_loss(model)
     loss_gp = train.gan_loss(
         model,
         gradient_penalty_weight=1.0,
         gradient_penalty_one_sided=one_sided)
-    self.assertTrue(isinstance(loss_gp, namedtuples.GANLoss))
+    self.assertIsInstance(loss_gp, namedtuples.GANLoss)
 
     # Check values.
     with self.test_session(use_gpu=True) as sess:
@@ -541,59 +563,28 @@ class GANLossTest(test.TestCase):
           [loss.discriminator_loss, loss_gp.discriminator_loss])
 
     self.assertEqual(loss_gen_np, loss_gen_gp_np)
-    self.assertTrue(loss_dis_np < loss_dis_gp_np)
-
-  def test_grad_penalty_gan(self):
-    self._test_grad_penalty_helper(create_gan_model)
-
-  def test_grad_penalty_callable_gan(self):
-    self._test_grad_penalty_helper(create_callable_gan_model)
-
-  def test_grad_penalty_infogan(self):
-    self._test_grad_penalty_helper(create_infogan_model)
-
-  def test_grad_penalty_callable_infogan(self):
-    self._test_grad_penalty_helper(create_callable_infogan_model)
-
-  def test_grad_penalty_acgan(self):
-    self._test_grad_penalty_helper(create_acgan_model)
-
-  def test_grad_penalty_callable_acgan(self):
-    self._test_grad_penalty_helper(create_callable_acgan_model)
-
-  def test_grad_penalty_one_sided_gan(self):
-    self._test_grad_penalty_helper(create_gan_model, one_sided=True)
-
-  def test_grad_penalty_one_sided_callable_gan(self):
-    self._test_grad_penalty_helper(create_callable_gan_model, one_sided=True)
-
-  def test_grad_penalty_one_sided_infogan(self):
-    self._test_grad_penalty_helper(create_infogan_model, one_sided=True)
-
-  def test_grad_penalty_one_sided_callable_infogan(self):
-    self._test_grad_penalty_helper(
-        create_callable_infogan_model, one_sided=True)
-
-  def test_grad_penalty_one_sided_acgan(self):
-    self._test_grad_penalty_helper(create_acgan_model, one_sided=True)
-
-  def test_grad_penalty_one_sided_callable_acgan(self):
-    self._test_grad_penalty_helper(create_callable_acgan_model, one_sided=True)
-
-  # Test mutual information penalty option.
-  def _test_mutual_info_penalty_helper(self, create_gan_model_fn):
+    self.assertLess(loss_dis_np, loss_dis_gp_np)
+
+  @parameterized.named_parameters(
+      ('infogan', get_infogan_model),
+      ('callable_infogan', get_callable_infogan_model),
+  )
+  def test_mutual_info_penalty(self, create_gan_model_fn):
+    """Test mutual information penalty option."""
     train.gan_loss(
         create_gan_model_fn(),
         mutual_information_penalty_weight=constant_op.constant(1.0))
 
-  def test_mutual_info_penalty_infogan(self):
-    self._test_mutual_info_penalty_helper(get_infogan_model)
-
-  def test_mutual_info_penalty_callable_infogan(self):
-    self._test_mutual_info_penalty_helper(get_callable_infogan_model)
-
-  # Test regularization loss.
-  def _test_regularization_helper(self, get_gan_model_fn):
+  @parameterized.named_parameters(
+      ('gan', get_gan_model),
+      ('callable_gan', get_callable_gan_model),
+      ('infogan', get_infogan_model),
+      ('callable_infogan', get_callable_infogan_model),
+      ('acgan', get_acgan_model),
+      ('callable_acgan', get_callable_acgan_model),
+  )
+  def test_regularization_helper(self, get_gan_model_fn):
+    """Test regularization loss."""
     # Evaluate losses without regularization.
     no_reg_loss = train.gan_loss(get_gan_model_fn())
     with self.test_session(use_gpu=True):
@@ -616,33 +607,19 @@ class GANLossTest(test.TestCase):
     self.assertEqual(3.0, reg_loss_gen_np - no_reg_loss_gen_np)
     self.assertEqual(2.0, reg_loss_dis_np - no_reg_loss_dis_np)
 
-  def test_regularization_gan(self):
-    self._test_regularization_helper(get_gan_model)
-
-  def test_regularization_callable_gan(self):
-    self._test_regularization_helper(get_callable_gan_model)
-
-  def test_regularization_infogan(self):
-    self._test_regularization_helper(get_infogan_model)
-
-  def test_regularization_callable_infogan(self):
-    self._test_regularization_helper(get_callable_infogan_model)
-
-  def test_regularization_acgan(self):
-    self._test_regularization_helper(get_acgan_model)
-
-  def test_regularization_callable_acgan(self):
-    self._test_regularization_helper(get_callable_acgan_model)
-
-  # Test that ACGan models work.
-  def _test_acgan_helper(self, create_gan_model_fn):
+  @parameterized.named_parameters(
+      ('notcallable', create_acgan_model),
+      ('callable', create_callable_acgan_model),
+  )
+  def test_acgan(self, create_gan_model_fn):
+    """Test that ACGAN models work."""
     model = create_gan_model_fn()
     loss = train.gan_loss(model)
     loss_ac_gen = train.gan_loss(model, aux_cond_generator_weight=1.0)
     loss_ac_dis = train.gan_loss(model, aux_cond_discriminator_weight=1.0)
-    self.assertTrue(isinstance(loss, namedtuples.GANLoss))
-    self.assertTrue(isinstance(loss_ac_gen, namedtuples.GANLoss))
-    self.assertTrue(isinstance(loss_ac_dis, namedtuples.GANLoss))
+    self.assertIsInstance(loss, namedtuples.GANLoss)
+    self.assertIsInstance(loss_ac_gen, namedtuples.GANLoss)
+    self.assertIsInstance(loss_ac_dis, namedtuples.GANLoss)
 
     # Check values.
     with self.test_session(use_gpu=True) as sess:
@@ -656,20 +633,18 @@ class GANLossTest(test.TestCase):
           loss_ac_dis.discriminator_loss
       ])
 
-    self.assertTrue(loss_gen_np < loss_dis_np)
+    self.assertLess(loss_gen_np, loss_dis_np)
     self.assertTrue(np.isscalar(loss_ac_gen_gen_np))
     self.assertTrue(np.isscalar(loss_ac_dis_gen_np))
     self.assertTrue(np.isscalar(loss_ac_gen_dis_np))
     self.assertTrue(np.isscalar(loss_ac_dis_dis_np))
 
-  def test_acgan(self):
-    self._test_acgan_helper(create_acgan_model)
-
-  def test_callable_acgan(self):
-    self._test_acgan_helper(create_callable_acgan_model)
-
-  # Test that CycleGan models work.
-  def _test_cyclegan_helper(self, create_gan_model_fn):
+  @parameterized.named_parameters(
+      ('notcallable', create_cyclegan_model),
+      ('callable', create_callable_cyclegan_model),
+  )
+  def test_cyclegan(self, create_gan_model_fn):
+    """Test that CycleGan models work."""
     model = create_gan_model_fn()
     loss = train.cyclegan_loss(model)
     self.assertIsInstance(loss, namedtuples.CycleGANLoss)
@@ -690,14 +665,65 @@ class GANLossTest(test.TestCase):
     self.assertTrue(np.isscalar(loss_y2x_gen_np))
     self.assertTrue(np.isscalar(loss_y2x_dis_np))
 
-  def test_cyclegan(self):
-    self._test_cyclegan_helper(create_cyclegan_model)
+  @parameterized.named_parameters(
+      ('gan', create_gan_model),
+      ('callable_gan', create_callable_gan_model),
+      ('infogan', create_infogan_model),
+      ('callable_infogan', create_callable_infogan_model),
+      ('acgan', create_acgan_model),
+      ('callable_acgan', create_callable_acgan_model),
+  )
+  def test_tensor_pool(self, create_gan_model_fn):
+    """Test tensor pool option."""
+    model = create_gan_model_fn()
+    tensor_pool_fn = lambda x: random_tensor_pool.tensor_pool(x, pool_size=5)
+    loss = train.gan_loss(model, tensor_pool_fn=tensor_pool_fn)
+    self.assertIsInstance(loss, namedtuples.GANLoss)
 
-  def test_callable_cyclegan(self):
-    self._test_cyclegan_helper(create_callable_cyclegan_model)
+    # Check values.
+    with self.test_session(use_gpu=True) as sess:
+      variables.global_variables_initializer().run()
+      for _ in range(10):
+        sess.run([loss.generator_loss, loss.discriminator_loss])
+
+  def test_discriminator_only_sees_pool(self):
+    """Checks that discriminator only sees pooled values."""
+    def checker_gen_fn(_):
+      return constant_op.constant(0.0)
+    model = train.gan_model(
+        checker_gen_fn,
+        discriminator_model,
+        real_data=array_ops.zeros([]),
+        generator_inputs=random_ops.random_normal([]))
+    def tensor_pool_fn(_):
+      return (random_ops.random_uniform([]), random_ops.random_uniform([]))
+    def checker_dis_fn(inputs, _):
+      """Discriminator that checks that it only sees pooled Tensors."""
+      self.assertFalse(constant_op.is_constant(inputs))
+      return inputs
+    model = model._replace(
+        discriminator_fn=checker_dis_fn)
+    train.gan_loss(model, tensor_pool_fn=tensor_pool_fn)
 
-  def _check_tensor_pool_adjusted_model_outputs(self, tensor1, tensor2,
-                                                pool_size):
+  def test_doesnt_crash_when_in_nested_scope(self):
+    with variable_scope.variable_scope('outer_scope'):
+      gan_model = train.gan_model(
+          generator_model,
+          discriminator_model,
+          real_data=array_ops.zeros([1, 2]),
+          generator_inputs=random_ops.random_normal([1, 2]))
+
+      # This should work inside a scope.
+      train.gan_loss(gan_model, gradient_penalty_weight=1.0)
+
+    # This should also work outside a scope.
+    train.gan_loss(gan_model, gradient_penalty_weight=1.0)
+
+
+class TensorPoolAdjusteModelTest(test.TestCase):
+
+  def _check_tensor_pool_adjusted_model_outputs(
+      self, tensor1, tensor2, pool_size):
     history_values = []
     with self.test_session(use_gpu=True) as sess:
       variables.global_variables_initializer().run()
@@ -714,115 +740,66 @@ class GANLossTest(test.TestCase):
           # pool).
           self.assertTrue(any([(v == t2).all() for v in history_values]))
 
-  # Test `_tensor_pool_adjusted_model` for gan model.
-  def test_tensor_pool_adjusted_model_gan(self):
-    model = create_gan_model()
-
-    new_model = train._tensor_pool_adjusted_model(model, None)
+  def _make_new_model_and_check(self, model, pool_size):
+    pool_fn = lambda x: random_tensor_pool.tensor_pool(x, pool_size=pool_size)
+    new_model = train._tensor_pool_adjusted_model(model, pool_fn)
     # 'Generator/dummy_g:0' and 'Discriminator/dummy_d:0'
     self.assertEqual(2, len(ops.get_collection(ops.GraphKeys.VARIABLES)))
-    self.assertIs(new_model.discriminator_gen_outputs,
-                  model.discriminator_gen_outputs)
-
-    pool_size = 5
-    new_model = train._tensor_pool_adjusted_model(
-        model, get_tensor_pool_fn(pool_size=pool_size))
     self.assertIsNot(new_model.discriminator_gen_outputs,
                      model.discriminator_gen_outputs)
+
+    return new_model
+
+  def test_tensor_pool_adjusted_model_gan(self):
+    """Test `_tensor_pool_adjusted_model` for gan model."""
+    pool_size = 5
+    model = create_gan_model()
+    new_model = self._make_new_model_and_check(model, pool_size)
+
     # Check values.
     self._check_tensor_pool_adjusted_model_outputs(
         model.discriminator_gen_outputs, new_model.discriminator_gen_outputs,
         pool_size)
 
-  # Test _tensor_pool_adjusted_model for infogan model.
   def test_tensor_pool_adjusted_model_infogan(self):
+    """Test _tensor_pool_adjusted_model for infogan model."""
+    pool_size = 5
     model = create_infogan_model()
+    new_model = self._make_new_model_and_check(model, pool_size)
 
-    pool_size = 5
-    new_model = train._tensor_pool_adjusted_model(
-        model, get_tensor_pool_fn_for_infogan(pool_size=pool_size))
-    # 'Generator/dummy_g:0' and 'Discriminator/dummy_d:0'
-    self.assertEqual(2, len(ops.get_collection(ops.GraphKeys.VARIABLES)))
-    self.assertIsNot(new_model.discriminator_gen_outputs,
-                     model.discriminator_gen_outputs)
+    # Check values.
     self.assertIsNot(new_model.predicted_distributions,
                      model.predicted_distributions)
-    # Check values.
     self._check_tensor_pool_adjusted_model_outputs(
         model.discriminator_gen_outputs, new_model.discriminator_gen_outputs,
         pool_size)
 
-  # Test _tensor_pool_adjusted_model for acgan model.
   def test_tensor_pool_adjusted_model_acgan(self):
+    """Test _tensor_pool_adjusted_model for acgan model."""
+    pool_size = 5
     model = create_acgan_model()
+    new_model = self._make_new_model_and_check(model, pool_size)
 
-    pool_size = 5
-    new_model = train._tensor_pool_adjusted_model(
-        model, get_tensor_pool_fn(pool_size=pool_size))
-    # 'Generator/dummy_g:0' and 'Discriminator/dummy_d:0'
-    self.assertEqual(2, len(ops.get_collection(ops.GraphKeys.VARIABLES)))
-    self.assertIsNot(new_model.discriminator_gen_outputs,
-                     model.discriminator_gen_outputs)
+    # Check values.
     self.assertIsNot(new_model.discriminator_gen_classification_logits,
                      model.discriminator_gen_classification_logits)
-    # Check values.
     self._check_tensor_pool_adjusted_model_outputs(
         model.discriminator_gen_outputs, new_model.discriminator_gen_outputs,
         pool_size)
 
-  # Test tensor pool.
-  def _test_tensor_pool_helper(self, create_gan_model_fn):
-    model = create_gan_model_fn()
-    if isinstance(model, namedtuples.InfoGANModel):
-      tensor_pool_fn = get_tensor_pool_fn_for_infogan(pool_size=5)
-    else:
-      tensor_pool_fn = get_tensor_pool_fn(pool_size=5)
-    loss = train.gan_loss(model, tensor_pool_fn=tensor_pool_fn)
-    self.assertTrue(isinstance(loss, namedtuples.GANLoss))
-
-    # Check values.
-    with self.test_session(use_gpu=True) as sess:
-      variables.global_variables_initializer().run()
-      for _ in range(10):
-        sess.run([loss.generator_loss, loss.discriminator_loss])
-
-  def test_tensor_pool_gan(self):
-    self._test_tensor_pool_helper(create_gan_model)
-
-  def test_tensor_pool_callable_gan(self):
-    self._test_tensor_pool_helper(create_callable_gan_model)
-
-  def test_tensor_pool_infogan(self):
-    self._test_tensor_pool_helper(create_infogan_model)
-
-  def test_tensor_pool_callable_infogan(self):
-    self._test_tensor_pool_helper(create_callable_infogan_model)
-
-  def test_tensor_pool_acgan(self):
-    self._test_tensor_pool_helper(create_acgan_model)
-
-  def test_tensor_pool_callable_acgan(self):
-    self._test_tensor_pool_helper(create_callable_acgan_model)
-
-  def test_doesnt_crash_when_in_nested_scope(self):
-    with variable_scope.variable_scope('outer_scope'):
-      gan_model = train.gan_model(
-          generator_model,
-          discriminator_model,
-          real_data=array_ops.zeros([1, 2]),
-          generator_inputs=random_ops.random_normal([1, 2]))
-
-      # This should work inside a scope.
-      train.gan_loss(gan_model, gradient_penalty_weight=1.0)
-
-    # This should also work outside a scope.
-    train.gan_loss(gan_model, gradient_penalty_weight=1.0)
-
 
-class GANTrainOpsTest(test.TestCase):
+class GANTrainOpsTest(test.TestCase, parameterized.TestCase):
   """Tests for `gan_train_ops`."""
 
-  def _test_output_type_helper(self, create_gan_model_fn):
+  @parameterized.named_parameters(
+      ('gan', create_gan_model),
+      ('callable_gan', create_callable_gan_model),
+      ('infogan', create_infogan_model),
+      ('callable_infogan', create_callable_infogan_model),
+      ('acgan', create_acgan_model),
+      ('callable_acgan', create_callable_acgan_model),
+  )
+  def test_output_type(self, create_gan_model_fn):
     model = create_gan_model_fn()
     loss = train.gan_loss(model)
 
@@ -836,28 +813,24 @@ class GANTrainOpsTest(test.TestCase):
         summarize_gradients=True,
         colocate_gradients_with_ops=True)
 
-    self.assertTrue(isinstance(train_ops, namedtuples.GANTrainOps))
-
-  def test_output_type_gan(self):
-    self._test_output_type_helper(create_gan_model)
-
-  def test_output_type_callable_gan(self):
-    self._test_output_type_helper(create_callable_gan_model)
-
-  def test_output_type_infogan(self):
-    self._test_output_type_helper(create_infogan_model)
-
-  def test_output_type_callable_infogan(self):
-    self._test_output_type_helper(create_callable_infogan_model)
-
-  def test_output_type_acgan(self):
-    self._test_output_type_helper(create_acgan_model)
-
-  def test_output_type_callable_acgan(self):
-    self._test_output_type_helper(create_callable_acgan_model)
+    self.assertIsInstance(train_ops, namedtuples.GANTrainOps)
 
   # TODO(joelshor): Add a test to check that custom update op is run.
-  def _test_unused_update_ops(self, create_gan_model_fn, provide_update_ops):
+  @parameterized.named_parameters(
+      ('gan', create_gan_model, False),
+      ('gan_provideupdates', create_gan_model, True),
+      ('callable_gan', create_callable_gan_model, False),
+      ('callable_gan_provideupdates', create_callable_gan_model, True),
+      ('infogan', create_infogan_model, False),
+      ('infogan_provideupdates', create_infogan_model, True),
+      ('callable_infogan', create_callable_infogan_model, False),
+      ('callable_infogan_provideupdates', create_callable_infogan_model, True),
+      ('acgan', create_acgan_model, False),
+      ('acgan_provideupdates', create_acgan_model, True),
+      ('callable_acgan', create_callable_acgan_model, False),
+      ('callable_acgan_provideupdates', create_callable_acgan_model, True),
+  )
+  def test_unused_update_ops(self, create_gan_model_fn, provide_update_ops):
     model = create_gan_model_fn()
     loss = train.gan_loss(model)
 
@@ -904,45 +877,16 @@ class GANTrainOpsTest(test.TestCase):
       self.assertEqual(1, gen_update_count.eval())
       self.assertEqual(1, dis_update_count.eval())
 
-  def test_unused_update_ops_gan(self):
-    self._test_unused_update_ops(create_gan_model, False)
-
-  def test_unused_update_ops_gan_provideupdates(self):
-    self._test_unused_update_ops(create_gan_model, True)
-
-  def test_unused_update_ops_callable_gan(self):
-    self._test_unused_update_ops(create_callable_gan_model, False)
-
-  def test_unused_update_ops_callable_gan_provideupdates(self):
-    self._test_unused_update_ops(create_callable_gan_model, True)
-
-  def test_unused_update_ops_infogan(self):
-    self._test_unused_update_ops(create_infogan_model, False)
-
-  def test_unused_update_ops_infogan_provideupdates(self):
-    self._test_unused_update_ops(create_infogan_model, True)
-
-  def test_unused_update_ops_callable_infogan(self):
-    self._test_unused_update_ops(create_callable_infogan_model, False)
-
-  def test_unused_update_ops_callable_infogan_provideupdates(self):
-    self._test_unused_update_ops(create_callable_infogan_model, True)
-
-  def test_unused_update_ops_acgan(self):
-    self._test_unused_update_ops(create_acgan_model, False)
-
-  def test_unused_update_ops_acgan_provideupdates(self):
-    self._test_unused_update_ops(create_acgan_model, True)
-
-  def test_unused_update_ops_callable_acgan(self):
-    self._test_unused_update_ops(create_callable_acgan_model, False)
-
-  def test_unused_update_ops_callable_acgan_provideupdates(self):
-    self._test_unused_update_ops(create_callable_acgan_model, True)
-
-  def _test_sync_replicas_helper(self,
-                                 create_gan_model_fn,
-                                 create_global_step=False):
+  @parameterized.named_parameters(
+      ('gan', create_gan_model, False),
+      ('callable_gan', create_callable_gan_model, False),
+      ('infogan', create_infogan_model, False),
+      ('callable_infogan', create_callable_infogan_model, False),
+      ('acgan', create_acgan_model, False),
+      ('callable_acgan', create_callable_acgan_model, False),
+      ('gan_canbeint32', create_gan_model, True),
+  )
+  def test_sync_replicas(self, create_gan_model_fn, create_global_step):
     model = create_gan_model_fn()
     loss = train.gan_loss(model)
     num_trainable_vars = len(variables_lib.get_trainable_variables())
@@ -956,7 +900,7 @@ class GANTrainOpsTest(test.TestCase):
     d_opt = get_sync_optimizer()
     train_ops = train.gan_train_ops(
         model, loss, generator_optimizer=g_opt, discriminator_optimizer=d_opt)
-    self.assertTrue(isinstance(train_ops, namedtuples.GANTrainOps))
+    self.assertIsInstance(train_ops, namedtuples.GANTrainOps)
     # No new trainable variables should have been added.
     self.assertEqual(num_trainable_vars,
                      len(variables_lib.get_trainable_variables()))
@@ -994,29 +938,8 @@ class GANTrainOpsTest(test.TestCase):
       coord.request_stop()
       coord.join(g_threads + d_threads)
 
-  def test_sync_replicas_gan(self):
-    self._test_sync_replicas_helper(create_gan_model)
-
-  def test_sync_replicas_callable_gan(self):
-    self._test_sync_replicas_helper(create_callable_gan_model)
-
-  def test_sync_replicas_infogan(self):
-    self._test_sync_replicas_helper(create_infogan_model)
-
-  def test_sync_replicas_callable_infogan(self):
-    self._test_sync_replicas_helper(create_callable_infogan_model)
-
-  def test_sync_replicas_acgan(self):
-    self._test_sync_replicas_helper(create_acgan_model)
 
-  def test_sync_replicas_callable_acgan(self):
-    self._test_sync_replicas_helper(create_callable_acgan_model)
-
-  def test_global_step_can_be_int32(self):
-    self._test_sync_replicas_helper(create_gan_model, create_global_step=True)
-
-
-class GANTrainTest(test.TestCase):
+class GANTrainTest(test.TestCase, parameterized.TestCase):
   """Tests for `gan_train`."""
 
   def _gan_train_ops(self, generator_add, discriminator_add):
@@ -1032,7 +955,15 @@ class GANTrainTest(test.TestCase):
         global_step_inc_op=step.assign_add(1))
     return train_ops
 
-  def _test_run_helper(self, create_gan_model_fn):
+  @parameterized.named_parameters(
+      ('gan', create_gan_model),
+      ('callable_gan', create_callable_gan_model),
+      ('infogan', create_infogan_model),
+      ('callable_infogan', create_callable_infogan_model),
+      ('acgan', create_acgan_model),
+      ('callable_acgan', create_callable_acgan_model),
+  )
+  def test_run_helper(self, create_gan_model_fn):
     random_seed.set_random_seed(1234)
     model = create_gan_model_fn()
     loss = train.gan_loss(model)
@@ -1048,26 +979,12 @@ class GANTrainTest(test.TestCase):
     self.assertTrue(np.isscalar(final_step))
     self.assertEqual(2, final_step)
 
-  def test_run_gan(self):
-    self._test_run_helper(create_gan_model)
-
-  def test_run_callable_gan(self):
-    self._test_run_helper(create_callable_gan_model)
-
-  def test_run_infogan(self):
-    self._test_run_helper(create_infogan_model)
-
-  def test_run_callable_infogan(self):
-    self._test_run_helper(create_callable_infogan_model)
-
-  def test_run_acgan(self):
-    self._test_run_helper(create_acgan_model)
-
-  def test_run_callable_acgan(self):
-    self._test_run_helper(create_callable_acgan_model)
-
-  # Test multiple train steps.
-  def _test_multiple_steps_helper(self, get_hooks_fn_fn):
+  @parameterized.named_parameters(
+      ('seq_train_steps', train.get_sequential_train_hooks),
+      ('efficient_seq_train_steps', train.get_joint_train_hooks),
+  )
+  def test_multiple_steps(self, get_hooks_fn_fn):
+    """Test multiple train steps."""
     train_ops = self._gan_train_ops(generator_add=10, discriminator_add=100)
     train_steps = namedtuples.GANTrainSteps(
         generator_train_steps=3, discriminator_train_steps=4)
@@ -1080,12 +997,6 @@ class GANTrainTest(test.TestCase):
     self.assertTrue(np.isscalar(final_step))
     self.assertEqual(1 + 3 * 10 + 4 * 100, final_step)
 
-  def test_multiple_steps_seq_train_steps(self):
-    self._test_multiple_steps_helper(train.get_sequential_train_hooks)
-
-  def test_multiple_steps_efficient_seq_train_steps(self):
-    self._test_multiple_steps_helper(train.get_joint_train_hooks)
-
   def test_supervisor_run_gan_model_train_ops_multiple_steps(self):
     step = training_util.create_global_step()
     train_ops = namedtuples.GANTrainOps(
@@ -1105,10 +1016,18 @@ class GANTrainTest(test.TestCase):
     self.assertEqual(17.0, final_loss)
 
 
-class PatchGANTest(test.TestCase):
+class PatchGANTest(test.TestCase, parameterized.TestCase):
   """Tests that functions work on PatchGAN style output."""
 
-  def _test_patchgan_helper(self, create_gan_model_fn):
+  @parameterized.named_parameters(
+      ('gan', create_gan_model),
+      ('callable_gan', create_callable_gan_model),
+      ('infogan', create_infogan_model),
+      ('callable_infogan', create_callable_infogan_model),
+      ('acgan', create_acgan_model),
+      ('callable_acgan', create_callable_acgan_model),
+  )
+  def test_patchgan(self, create_gan_model_fn):
     """Ensure that patch-based discriminators work end-to-end."""
     random_seed.set_random_seed(1234)
     model = create_gan_model_fn()
@@ -1125,24 +1044,6 @@ class PatchGANTest(test.TestCase):
     self.assertTrue(np.isscalar(final_step))
     self.assertEqual(2, final_step)
 
-  def test_patchgan_gan(self):
-    self._test_patchgan_helper(create_gan_model)
-
-  def test_patchgan_callable_gan(self):
-    self._test_patchgan_helper(create_callable_gan_model)
-
-  def test_patchgan_infogan(self):
-    self._test_patchgan_helper(create_infogan_model)
-
-  def test_patchgan_callable_infogan(self):
-    self._test_patchgan_helper(create_callable_infogan_model)
-
-  def test_patchgan_acgan(self):
-    self._test_patchgan_helper(create_acgan_model)
-
-  def test_patchgan_callable_acgan(self):
-    self._test_patchgan_helper(create_callable_acgan_model)
-
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/gdr/gdr_memory_manager.cc b/tensorflow/contrib/gdr/gdr_memory_manager.cc
index 1435e19109ca2f3bbd6ce70e6e5f26a92dfc2713..f3bbf6b4d78b50b11e23abd584bacff8f3d877c7 100644
--- a/tensorflow/contrib/gdr/gdr_memory_manager.cc
+++ b/tensorflow/contrib/gdr/gdr_memory_manager.cc
@@ -33,10 +33,11 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
+#include "tensorflow/core/common_runtime/pool_allocator.h"
+#include "tensorflow/core/common_runtime/process_state.h"
 #if GOOGLE_CUDA
 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
-#include "tensorflow/core/common_runtime/process_state.h"
 #endif  // GOOGLE_CUDA
 #include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -182,28 +183,25 @@ class GdrMemoryManager : public RemoteMemoryManager {
   TF_DISALLOW_COPY_AND_ASSIGN(GdrMemoryManager);
 };
 
-// TODO(byronyi): remove this class duplicated from the one in
-// common/runtime/gpu/pool_allocator.h when it is available in common_runtime
-class BasicCPUAllocator : public SubAllocator {
- public:
-  ~BasicCPUAllocator() override {}
-
-  void* Alloc(size_t alignment, size_t num_bytes) override {
-    return port::AlignedMalloc(num_bytes, alignment);
-  }
-  void Free(void* ptr, size_t) override { port::AlignedFree(ptr); }
-};
-
 // TODO(byronyi): remove this class and its registration when the default
-// cpu_allocator() returns visitable allocator
+// cpu_allocator() returns visitable allocator, or cpu_allocator() is no
+// longer in use.
 class BFCRdmaAllocator : public BFCAllocator {
  public:
   BFCRdmaAllocator()
-      : BFCAllocator(new BasicCPUAllocator(), 1LL << 36, true, "cpu_rdma_bfc") {
+      : BFCAllocator(new BasicCPUAllocator(port::kNUMANoAffinity), 1LL << 36,
+                     true, "cpu_rdma_bfc") {}
+};
+class BFCRdmaAllocatorFactory : public AllocatorFactory {
+ public:
+  Allocator* CreateAllocator() override { return new BFCRdmaAllocator; }
+
+  virtual SubAllocator* CreateSubAllocator(int numa_node) {
+    return new BasicCPUAllocator(numa_node);
   }
 };
 
-REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocator);
+REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocatorFactory);
 
 GdrMemoryManager::GdrMemoryManager(const string& host, const string& port)
     : host_(host),
@@ -276,8 +274,8 @@ Status GdrMemoryManager::Init() {
   Allocator* allocators[] = {
 #if GOOGLE_CUDA
     GPUProcessState::singleton()->GetCUDAHostAllocator(0),
-    ProcessState::singleton()->GetCPUAllocator(0),
 #endif  // GOOGLE_CUDA
+    ProcessState::singleton()->GetCPUAllocator(0),
     cpu_allocator(),
   };
 
diff --git a/tensorflow/contrib/graph_editor/reroute.py b/tensorflow/contrib/graph_editor/reroute.py
index 95c02a64d47c26e731ef2628fb551529e9bc3f4d..d42e0c01f455f861e9ccdbfb79aefab762e61abe 100644
--- a/tensorflow/contrib/graph_editor/reroute.py
+++ b/tensorflow/contrib/graph_editor/reroute.py
@@ -208,9 +208,9 @@ def _reroute_ts(ts0, ts1, mode, can_modify=None, cannot_modify=None):
 def swap_ts(ts0, ts1, can_modify=None, cannot_modify=None):
   """For each tensor's pair, swap the end of (t0,t1).
 
-  B0 B1     B0 B1
-  |  |    =>  X
-  A0 A1     A0 A1
+      B0 B1     B0 B1
+      |  |    =>  X
+      A0 A1     A0 A1
 
   Args:
     ts0: an object convertible to a list of `tf.Tensor`.
@@ -233,9 +233,9 @@ def swap_ts(ts0, ts1, can_modify=None, cannot_modify=None):
 def reroute_ts(ts0, ts1, can_modify=None, cannot_modify=None):
   """For each tensor's pair, replace the end of t1 by the end of t0.
 
-  B0 B1     B0 B1
-  |  |    => |/
-  A0 A1     A0 A1
+      B0 B1     B0 B1
+      |  |    => |/
+      A0 A1     A0 A1
 
   The end of the tensors in ts1 are left dangling.
 
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index beeabd6b65631cad88efd10d5faee1917e162e41..dd602cf3a9b7826a19408a78ef543bb0c4fbf84e 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -1702,19 +1702,22 @@ def _inner_flatten(inputs, new_rank, output_collections=None, scope=None):
   return utils.collect_named_outputs(output_collections, sc, flattened)
 
 
-def _model_variable_getter(getter,
-                           name,
-                           shape=None,
-                           dtype=None,
-                           initializer=None,
-                           regularizer=None,
-                           trainable=True,
-                           collections=None,
-                           caching_device=None,
-                           partitioner=None,
-                           rename=None,
-                           use_resource=None,
-                           **_):
+def _model_variable_getter(
+    getter,
+    name,
+    shape=None,
+    dtype=None,
+    initializer=None,
+    regularizer=None,
+    trainable=True,
+    collections=None,
+    caching_device=None,
+    partitioner=None,
+    rename=None,
+    use_resource=None,
+    synchronization=tf_variables.VariableSynchronization.AUTO,
+    aggregation=tf_variables.VariableAggregation.NONE,
+    **_):
   """Getter that uses model_variable for compatibility with core layers."""
   short_name = name.split('/')[-1]
   if rename and short_name in rename:
@@ -1732,7 +1735,9 @@ def _model_variable_getter(getter,
       caching_device=caching_device,
       partitioner=partitioner,
       custom_getter=getter,
-      use_resource=use_resource)
+      use_resource=use_resource,
+      synchronization=synchronization,
+      aggregation=aggregation)
 
 
 def _build_variable_getter(rename=None):
diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py
index 339c4e0e360ed9ef9906f0e51b64a0dc13826259..ded93d4a7fb473c0c5df446ea89c5ab7784e9f3c 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/head.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/head.py
@@ -563,10 +563,10 @@ def _mean_squared_loss(labels, logits, weights=None):
     labels = ops.convert_to_tensor(labels)
     # To prevent broadcasting inside "-".
     if len(labels.get_shape()) == 1:
-      labels = array_ops.expand_dims(labels, dim=(1,))
+      labels = array_ops.expand_dims(labels, axis=(1,))
     # TODO(zakaria): make sure it does not recreate the broadcast bug.
     if len(logits.get_shape()) == 1:
-      logits = array_ops.expand_dims(logits, dim=(1,))
+      logits = array_ops.expand_dims(logits, axis=(1,))
     logits.get_shape().assert_is_compatible_with(labels.get_shape())
     loss = math_ops.square(logits - math_ops.to_float(labels), name=name)
     return _compute_weighted_loss(loss, weights)
@@ -579,10 +579,10 @@ def _poisson_loss(labels, logits, weights=None):
     labels = ops.convert_to_tensor(labels)
     # To prevent broadcasting inside "-".
     if len(labels.get_shape()) == 1:
-      labels = array_ops.expand_dims(labels, dim=(1,))
+      labels = array_ops.expand_dims(labels, axis=(1,))
     # TODO(zakaria): make sure it does not recreate the broadcast bug.
     if len(logits.get_shape()) == 1:
-      logits = array_ops.expand_dims(logits, dim=(1,))
+      logits = array_ops.expand_dims(logits, axis=(1,))
     logits.get_shape().assert_is_compatible_with(labels.get_shape())
     loss = nn.log_poisson_loss(labels, logits, compute_full_loss=True,
                                name=name)
@@ -797,7 +797,7 @@ def _log_loss_with_two_classes(labels, logits, weights=None):
     # TODO(ptucker): This will break for dynamic shapes.
     # sigmoid_cross_entropy_with_logits requires [batch_size, 1] labels.
     if len(labels.get_shape()) == 1:
-      labels = array_ops.expand_dims(labels, dim=(1,))
+      labels = array_ops.expand_dims(labels, axis=(1,))
     loss = nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits,
                                                 name=name)
     return _compute_weighted_loss(loss, weights)
diff --git a/tensorflow/contrib/lite/allocation.cc b/tensorflow/contrib/lite/allocation.cc
index c42622ff02fc2837b61b35f19e834276c0518d1e..ef6c14f08532a8d25ab9be6000bc0f24559074d2 100644
--- a/tensorflow/contrib/lite/allocation.cc
+++ b/tensorflow/contrib/lite/allocation.cc
@@ -99,7 +99,9 @@ FileCopyAllocation::FileCopyAllocation(const char* filename,
                             filename);
     return;
   }
-  copied_buffer_ = std::move(buffer);
+  // Versions of GCC before 6.2.0 don't support std::move from non-const
+  // char[] to const char[] unique_ptrs.
+  copied_buffer_.reset(const_cast<char const*>(buffer.release()));
 }
 
 FileCopyAllocation::~FileCopyAllocation() {}
diff --git a/tensorflow/contrib/lite/arena_planner.cc b/tensorflow/contrib/lite/arena_planner.cc
index 16a0e7162475391d0cdf73468fdf981a804350b1..02442575b3aeed04ac6569440dd52a4d5ddd4d98 100644
--- a/tensorflow/contrib/lite/arena_planner.cc
+++ b/tensorflow/contrib/lite/arena_planner.cc
@@ -17,14 +17,6 @@ limitations under the License.
 
 namespace tflite {
 
-namespace {
-
-// Memory allocation tuning
-constexpr const int kDefaultArenaAlignment = 64;
-constexpr const int kDefaultTensorAlignment = 4;
-
-}  // namespace
-
 struct AllocationInfo {
   // The node index requesting this allocation.
   int node;
@@ -36,13 +28,16 @@ struct AllocationInfo {
 
 ArenaPlanner::ArenaPlanner(TfLiteContext* context,
                            std::unique_ptr<GraphInfo> graph_info,
-                           bool preserve_inputs, bool preserve_intermediates)
+                           bool preserve_inputs, bool preserve_intermediates,
+                           int tensor_alignment)
     : context_(context),
       graph_info_(std::move(graph_info)),
       arena_(kDefaultArenaAlignment),
       persistent_arena_(kDefaultArenaAlignment),
       preserve_inputs_(preserve_inputs),
-      preserve_intermediates_(preserve_intermediates) {}
+      preserve_intermediates_(preserve_intermediates),
+      tensor_alignment_(tensor_alignment) {}
+
 ArenaPlanner::~ArenaPlanner() {}
 
 int64_t ArenaPlanner::BasePointer(TfLiteAllocationType type) {
@@ -264,14 +259,12 @@ TfLiteStatus ArenaPlanner::ResolveTensorAllocation(int tensor_index) {
 TfLiteStatus ArenaPlanner::CalculateTensorAllocation(int tensor_index) {
   TfLiteTensor& tensor = *graph_info_->tensor(tensor_index);
   if (tensor.allocation_type == kTfLiteArenaRw) {
-    TF_LITE_ENSURE_STATUS(arena_.Allocate(context_, kDefaultTensorAlignment,
-                                          tensor.bytes,
-                                          &allocs_[tensor_index]));
+    TF_LITE_ENSURE_STATUS(arena_.Allocate(
+        context_, tensor_alignment_, tensor.bytes, &allocs_[tensor_index]));
   }
   if (tensor.allocation_type == kTfLiteArenaRwPersistent) {
-    TF_LITE_ENSURE_STATUS(
-        persistent_arena_.Allocate(context_, kDefaultTensorAlignment,
-                                   tensor.bytes, &allocs_[tensor_index]));
+    TF_LITE_ENSURE_STATUS(persistent_arena_.Allocate(
+        context_, tensor_alignment_, tensor.bytes, &allocs_[tensor_index]));
   }
   return kTfLiteOk;
 }
diff --git a/tensorflow/contrib/lite/arena_planner.h b/tensorflow/contrib/lite/arena_planner.h
index 82c866734f33aace5fce147393fc7284e6f33d2c..55003cf4e92d9ca79416c0f9f7a0c57e828af4ee 100644
--- a/tensorflow/contrib/lite/arena_planner.h
+++ b/tensorflow/contrib/lite/arena_planner.h
@@ -25,6 +25,10 @@ limitations under the License.
 
 namespace tflite {
 
+// Memory allocation tuning
+constexpr const int kDefaultArenaAlignment = 64;
+constexpr const int kDefaultTensorAlignment = 64;
+
 struct AllocationInfo;
 
 // A memory planner that makes all the allocations using arenas.
@@ -47,7 +51,8 @@ class ArenaPlanner : public MemoryPlanner {
   // graph will not share memory with any other tensor, effectively preserving
   // them until the end of inference.
   ArenaPlanner(TfLiteContext* context, std::unique_ptr<GraphInfo> graph_info,
-               bool preserve_inputs, bool preserve_intermediates);
+               bool preserve_inputs, bool preserve_intermediates,
+               int tensor_alignment = kDefaultTensorAlignment);
   ~ArenaPlanner() override;
   ArenaPlanner(const ArenaPlanner&) = delete;
   ArenaPlanner& operator=(const ArenaPlanner&) = delete;
@@ -112,6 +117,9 @@ class ArenaPlanner : public MemoryPlanner {
   // If true, then no overlapping of memory areas is done, meaning intermediates
   // results can be queried after running (modulo running delegates).
   bool preserve_intermediates_;
+
+  // Number of bytes that tensor buffers should be aligned to.
+  int tensor_alignment_;
 };
 
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/arena_planner_test.cc b/tensorflow/contrib/lite/arena_planner_test.cc
index 1adb426d583186655660e3c0e2c7c73e0e25e488..7d7c41289cad95b73423a7218bf1e0516b2e87a2 100644
--- a/tensorflow/contrib/lite/arena_planner_test.cc
+++ b/tensorflow/contrib/lite/arena_planner_test.cc
@@ -24,6 +24,8 @@ limitations under the License.
 namespace tflite {
 namespace {
 
+constexpr const int kTensorAlignment = 4;
+
 // A simple op to be used in tests, as syntactic sugar.
 class TestOp {
  public:
@@ -156,7 +158,7 @@ class ArenaPlannerTest : public ::testing::Test {
     context_.ReportError = ReportError;
     planner_.reset(new ArenaPlanner(
         &context_, std::unique_ptr<GraphInfo>(new TestGraphInfo(graph)),
-        preserve_inputs, /*preserve intermediates*/ false));
+        preserve_inputs, /*preserve intermediates*/ false, kTensorAlignment));
     CHECK(planner_->ResetAllocations() == kTfLiteOk);
     CHECK(planner_->PlanAllocations() == kTfLiteOk);
   }
@@ -178,8 +180,8 @@ class ArenaPlannerTest : public ::testing::Test {
     const TfLiteTensor& tensor = (*graph_->tensors())[tensor_index];
     int64_t offset = GetOffset(tensor_index) + tensor.bytes;
     // We must make sure the offset is aligned to kDefaultArenaAlignment.
-    if (offset % 4 != 0) {
-      offset += 4 - offset % 4;
+    if (offset % kTensorAlignment != 0) {
+      offset += kTensorAlignment - offset % kTensorAlignment;
     }
     return offset;
   };
diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index bed862454ef9a30f222c2c2174656f8f2c821d60..2e91632459dc9236ba7e8c8b7a1f2bffd148ccb6 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -1,193 +1,214 @@
 """Generate Flatbuffer binary from json."""
+
 load(
     "//tensorflow:tensorflow.bzl",
     "tf_cc_test",
+    "tf_cc_shared_object",
 )
 
 def tflite_copts():
-  """Defines compile time flags."""
-  copts = [
-      "-DFARMHASH_NO_CXX_STRING",
-  ] + select({
-          str(Label("//tensorflow:android_arm64")): [
-              "-std=c++11",
-              "-O3",
-          ],
-          str(Label("//tensorflow:android_arm")): [
-              "-mfpu=neon",
-              "-mfloat-abi=softfp",
-              "-std=c++11",
-              "-O3",
-          ],
-          str(Label("//tensorflow:android_x86")): [
-              "-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK",
-          ],
-          str(Label("//tensorflow:ios_x86_64")): [
-              "-msse4.1",
-          ],
-          "//conditions:default": [],
-  }) + select({
-      str(Label("//tensorflow:with_default_optimizations")): [],
-      "//conditions:default": ["-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK"],
-  })
+    """Defines compile time flags."""
+    copts = [
+        "-DFARMHASH_NO_CXX_STRING",
+    ] + select({
+        str(Label("//tensorflow:android_arm64")): [
+            "-std=c++11",
+            "-O3",
+        ],
+        str(Label("//tensorflow:android_arm")): [
+            "-mfpu=neon",
+            "-mfloat-abi=softfp",
+            "-std=c++11",
+            "-O3",
+        ],
+        str(Label("//tensorflow:android_x86")): [
+            "-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK",
+        ],
+        str(Label("//tensorflow:ios_x86_64")): [
+            "-msse4.1",
+        ],
+        str(Label("//tensorflow:windows")): [
+            "/DTF_COMPILE_LIBRARY",
+        ],
+        "//conditions:default": [],
+    }) + select({
+        str(Label("//tensorflow:with_default_optimizations")): [],
+        "//conditions:default": ["-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK"],
+    })
 
-  return copts
+    return copts
 
 LINKER_SCRIPT = "//tensorflow/contrib/lite/java/src/main/native:version_script.lds"
 
 def tflite_linkopts_unstripped():
-  """Defines linker flags to reduce size of TFLite binary.
+    """Defines linker flags to reduce size of TFLite binary.
 
-     These are useful when trying to investigate the relative size of the
-     symbols in TFLite.
+       These are useful when trying to investigate the relative size of the
+       symbols in TFLite.
 
-  Returns:
-     a select object with proper linkopts
-  """
-  return select({
-      "//tensorflow:android": [
-          "-Wl,--no-export-dynamic", # Only inc syms referenced by dynamic obj.
-          "-Wl,--exclude-libs,ALL",  # Exclude syms in all libs from auto export.
-          "-Wl,--gc-sections", # Eliminate unused code and data.
-          "-Wl,--as-needed", # Don't link unused libs.
-      ],
-      "//tensorflow/contrib/lite:mips": [],
-      "//tensorflow/contrib/lite:mips64": [],
-      "//conditions:default": [
-          "-Wl,--icf=all",  # Identical code folding.
-      ],
-  })
+    Returns:
+       a select object with proper linkopts
+    """
+    return select({
+        "//tensorflow:android": [
+            "-Wl,--no-export-dynamic",  # Only inc syms referenced by dynamic obj.
+            "-Wl,--exclude-libs,ALL",  # Exclude syms in all libs from auto export.
+            "-Wl,--gc-sections",  # Eliminate unused code and data.
+            "-Wl,--as-needed",  # Don't link unused libs.
+        ],
+        "//tensorflow/contrib/lite:mips": [],
+        "//tensorflow/contrib/lite:mips64": [],
+        "//conditions:default": [
+            "-Wl,--icf=all",  # Identical code folding.
+        ],
+    })
 
 def tflite_jni_linkopts_unstripped():
-  """Defines linker flags to reduce size of TFLite binary with JNI.
+    """Defines linker flags to reduce size of TFLite binary with JNI.
 
-     These are useful when trying to investigate the relative size of the
-     symbols in TFLite.
+       These are useful when trying to investigate the relative size of the
+       symbols in TFLite.
 
-  Returns:
-     a select object with proper linkopts
-  """
-  return select({
-      "//tensorflow:android": [
-          "-Wl,--gc-sections", # Eliminate unused code and data.
-          "-Wl,--as-needed", # Don't link unused libs.
-      ],
-      "//tensorflow/contrib/lite:mips": [],
-      "//tensorflow/contrib/lite:mips64": [],
-      "//conditions:default": [
-          "-Wl,--icf=all",  # Identical code folding.
-      ],
-  })
+    Returns:
+       a select object with proper linkopts
+    """
+    return select({
+        "//tensorflow:android": [
+            "-Wl,--gc-sections",  # Eliminate unused code and data.
+            "-Wl,--as-needed",  # Don't link unused libs.
+        ],
+        "//tensorflow/contrib/lite:mips": [],
+        "//tensorflow/contrib/lite:mips64": [],
+        "//conditions:default": [
+            "-Wl,--icf=all",  # Identical code folding.
+        ],
+    })
 
 def tflite_linkopts():
-  """Defines linker flags to reduce size of TFLite binary."""
-  return tflite_linkopts_unstripped() + select({
-      "//tensorflow:android": [
-          "-s",  # Omit symbol table.
-      ],
-      "//conditions:default": [],
-  })
+    """Defines linker flags to reduce size of TFLite binary."""
+    return tflite_linkopts_unstripped() + select({
+        "//tensorflow:android": [
+            "-s",  # Omit symbol table.
+        ],
+        "//conditions:default": [],
+    })
 
 def tflite_jni_linkopts():
-  """Defines linker flags to reduce size of TFLite binary with JNI."""
-  return tflite_jni_linkopts_unstripped() + select({
-      "//tensorflow:android": [
-          "-s",  # Omit symbol table.
-          "-latomic",  # Required for some uses of ISO C++11 <atomic> in x86.
-      ],
-      "//conditions:default": [],
-  })
+    """Defines linker flags to reduce size of TFLite binary with JNI."""
+    return tflite_jni_linkopts_unstripped() + select({
+        "//tensorflow:android": [
+            "-s",  # Omit symbol table.
+            "-latomic",  # Required for some uses of ISO C++11 <atomic> in x86.
+        ],
+        "//conditions:default": [],
+    })
+
+def tflite_jni_binary(
+        name,
+        copts = tflite_copts(),
+        linkopts = tflite_jni_linkopts(),
+        linkscript = LINKER_SCRIPT,
+        linkshared = 1,
+        linkstatic = 1,
+        deps = []):
+    """Builds a jni binary for TFLite."""
+    linkopts = linkopts + [
+        "-Wl,--version-script",  # Export only jni functions & classes.
+        "$(location {})".format(linkscript),
+    ]
+    native.cc_binary(
+        name = name,
+        copts = copts,
+        linkshared = linkshared,
+        linkstatic = linkstatic,
+        deps = deps + [linkscript],
+        linkopts = linkopts,
+    )
 
-def tflite_jni_binary(name,
-                      copts=tflite_copts(),
-                      linkopts=tflite_jni_linkopts(),
-                      linkscript=LINKER_SCRIPT,
-                      linkshared=1,
-                      linkstatic=1,
-                      deps=[]):
-  """Builds a jni binary for TFLite."""
-  linkopts = linkopts + [
-      "-Wl,--version-script",  # Export only jni functions & classes.
-      "$(location {})".format(linkscript),
-  ]
-  native.cc_binary(
+def tflite_cc_shared_object(name,
+                            copts=tflite_copts(),
+                            linkopts=[],
+                            linkstatic=1,
+                            deps=[]):
+  """Builds a shared object for TFLite."""
+  tf_cc_shared_object(
       name=name,
       copts=copts,
-      linkshared=linkshared,
       linkstatic=linkstatic,
-      deps= deps + [linkscript],
-      linkopts=linkopts)
+      linkopts=linkopts + tflite_jni_linkopts(),
+      framework_so=[],
+      deps=deps)
 
 def tf_to_tflite(name, src, options, out):
-  """Convert a frozen tensorflow graphdef to TF Lite's flatbuffer.
+    """Convert a frozen tensorflow graphdef to TF Lite's flatbuffer.
 
-  Args:
-    name: Name of rule.
-    src: name of the input graphdef file.
-    options: options passed to TOCO.
-    out: name of the output flatbuffer file.
-  """
+    Args:
+      name: Name of rule.
+      src: name of the input graphdef file.
+      options: options passed to TOCO.
+      out: name of the output flatbuffer file.
+    """
 
-  toco_cmdline = " ".join([
-      "//tensorflow/contrib/lite/toco:toco",
-      "--input_format=TENSORFLOW_GRAPHDEF",
-      "--output_format=TFLITE",
-      ("--input_file=$(location %s)" % src),
-      ("--output_file=$(location %s)" % out),
-  ] + options )
-  native.genrule(
-      name = name,
-      srcs=[src],
-      outs=[out],
-      cmd = toco_cmdline,
-      tools= ["//tensorflow/contrib/lite/toco:toco"],
-  )
+    toco_cmdline = " ".join([
+        "//tensorflow/contrib/lite/toco:toco",
+        "--input_format=TENSORFLOW_GRAPHDEF",
+        "--output_format=TFLITE",
+        ("--input_file=$(location %s)" % src),
+        ("--output_file=$(location %s)" % out),
+    ] + options)
+    native.genrule(
+        name = name,
+        srcs = [src],
+        outs = [out],
+        cmd = toco_cmdline,
+        tools = ["//tensorflow/contrib/lite/toco:toco"],
+    )
 
 def tflite_to_json(name, src, out):
-  """Convert a TF Lite flatbuffer to JSON.
+    """Convert a TF Lite flatbuffer to JSON.
 
-  Args:
-    name: Name of rule.
-    src: name of the input flatbuffer file.
-    out: name of the output JSON file.
-  """
+    Args:
+      name: Name of rule.
+      src: name of the input flatbuffer file.
+      out: name of the output JSON file.
+    """
 
-  flatc = "@flatbuffers//:flatc"
-  schema = "//tensorflow/contrib/lite/schema:schema.fbs"
-  native.genrule(
-      name = name,
-      srcs = [schema, src],
-      outs = [out],
-      cmd = ("TMP=`mktemp`; cp $(location %s) $${TMP}.bin &&"  +
-             "$(location %s) --raw-binary --strict-json -t" +
-             " -o /tmp $(location %s) -- $${TMP}.bin &&" +
-             "cp $${TMP}.json $(location %s)")
-            % (src, flatc, schema, out),
-      tools = [flatc],
-  )
+    flatc = "@flatbuffers//:flatc"
+    schema = "//tensorflow/contrib/lite/schema:schema.fbs"
+    native.genrule(
+        name = name,
+        srcs = [schema, src],
+        outs = [out],
+        cmd = ("TMP=`mktemp`; cp $(location %s) $${TMP}.bin &&" +
+               "$(location %s) --raw-binary --strict-json -t" +
+               " -o /tmp $(location %s) -- $${TMP}.bin &&" +
+               "cp $${TMP}.json $(location %s)") %
+              (src, flatc, schema, out),
+        tools = [flatc],
+    )
 
 def json_to_tflite(name, src, out):
-  """Convert a JSON file to TF Lite's flatbuffer.
+    """Convert a JSON file to TF Lite's flatbuffer.
 
-  Args:
-    name: Name of rule.
-    src: name of the input JSON file.
-    out: name of the output flatbuffer file.
-  """
+    Args:
+      name: Name of rule.
+      src: name of the input JSON file.
+      out: name of the output flatbuffer file.
+    """
 
-  flatc = "@flatbuffers//:flatc"
-  schema = "//tensorflow/contrib/lite/schema:schema_fbs"
-  native.genrule(
-      name = name,
-      srcs = [schema, src],
-      outs = [out],
-      cmd = ("TMP=`mktemp`; cp $(location %s) $${TMP}.json &&"  +
-             "$(location %s) --raw-binary --unknown-json --allow-non-utf8 -b" +
-             " -o /tmp $(location %s) $${TMP}.json &&" +
-             "cp $${TMP}.bin $(location %s)")
-      % (src, flatc, schema, out),
-      tools = [flatc],
-  )
+    flatc = "@flatbuffers//:flatc"
+    schema = "//tensorflow/contrib/lite/schema:schema_fbs"
+    native.genrule(
+        name = name,
+        srcs = [schema, src],
+        outs = [out],
+        cmd = ("TMP=`mktemp`; cp $(location %s) $${TMP}.json &&" +
+               "$(location %s) --raw-binary --unknown-json --allow-non-utf8 -b" +
+               " -o /tmp $(location %s) $${TMP}.json &&" +
+               "cp $${TMP}.bin $(location %s)") %
+              (src, flatc, schema, out),
+        tools = [flatc],
+    )
 
 # This is the master list of generated examples that will be made into tests. A
 # function called make_XXX_tests() must also appear in generate_examples.py.
@@ -230,12 +251,14 @@ def generated_test_models():
         "mul",
         "neg",
         "not_equal",
+        "one_hot",
+        "pack",
         "pad",
         "padv2",
         "prelu",
         "pow",
         "reduce_max",
-        "reduce_prod",
+        #"reduce_prod",  # disabled due to b/111823366
         "relu",
         "relu1",
         "relu6",
@@ -264,58 +287,58 @@ def generated_test_models():
     ]
 
 def gen_zip_test(name, test_name, **kwargs):
-  """Generate a zipped-example test and its dependent zip files.
+    """Generate a zipped-example test and its dependent zip files.
 
-  Args:
-    name: Resulting cc_test target name
-    test_name: Test targets this model. Comes from the list above.
-    **kwargs: tf_cc_test kwargs.
-  """
-  gen_zipped_test_file(
-      name = "zip_%s" % test_name,
-      file = "%s.zip" % test_name,
-  )
-  tf_cc_test(name, **kwargs)
+    Args:
+      name: Resulting cc_test target name
+      test_name: Test targets this model. Comes from the list above.
+      **kwargs: tf_cc_test kwargs.
+    """
+    gen_zipped_test_file(
+        name = "zip_%s" % test_name,
+        file = "%s.zip" % test_name,
+    )
+    tf_cc_test(name, **kwargs)
 
 def gen_zipped_test_file(name, file):
-  """Generate a zip file of tests by using :generate_examples.
+    """Generate a zip file of tests by using :generate_examples.
 
-  Args:
-    name: Name of output. We will produce "`file`.files" as a target.
-    file: The name of one of the generated_examples targets, e.g. "transpose"
-  """
-  toco = "//tensorflow/contrib/lite/toco:toco"
-  native.genrule(
-      name = file + ".files",
-      cmd = ("$(locations :generate_examples) --toco $(locations %s) " % toco
-             + " --zip_to_output " + file + " $(@D)"),
-      outs = [file],
-      tools = [
-          ":generate_examples",
-          toco,
-      ],
-  )
+    Args:
+      name: Name of output. We will produce "`file`.files" as a target.
+      file: The name of one of the generated_examples targets, e.g. "transpose"
+    """
+    toco = "//tensorflow/contrib/lite/toco:toco"
+    native.genrule(
+        name = file + ".files",
+        cmd = ("$(locations :generate_examples) --toco $(locations %s) " % toco +
+               " --zip_to_output " + file + " $(@D)"),
+        outs = [file],
+        tools = [
+            ":generate_examples",
+            toco,
+        ],
+    )
 
-  native.filegroup(
-      name = name,
-      srcs = [file],
-  )
+    native.filegroup(
+        name = name,
+        srcs = [file],
+    )
 
 def gen_selected_ops(name, model):
-  """Generate the library that includes only used ops.
+    """Generate the library that includes only used ops.
 
-  Args:
-    name: Name of the generated library.
-    model: TFLite model to interpret.
-  """
-  out = name + "_registration.cc"
-  tool = "//tensorflow/contrib/lite/tools:generate_op_registrations"
-  tflite_path = "//tensorflow/contrib/lite"
-  native.genrule(
-      name = name,
-      srcs = [model],
-      outs = [out],
-      cmd = ("$(location %s) --input_model=$(location %s) --output_registration=$(location %s) --tflite_path=%s")
-      % (tool, model, out, tflite_path[2:]),
-      tools = [tool],
-  )
+    Args:
+      name: Name of the generated library.
+      model: TFLite model to interpret.
+    """
+    out = name + "_registration.cc"
+    tool = "//tensorflow/contrib/lite/tools:generate_op_registrations"
+    tflite_path = "//tensorflow/contrib/lite"
+    native.genrule(
+        name = name,
+        srcs = [model],
+        outs = [out],
+        cmd = ("$(location %s) --input_model=$(location %s) --output_registration=$(location %s) --tflite_path=%s") %
+              (tool, model, out, tflite_path[2:]),
+        tools = [tool],
+    )
diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index a24aaad7dda56a2ece059cced4a3ca2cf13c26b9..70178b2faabe85f8a53a94c2b5d2e3ea40c8ba05 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -277,6 +277,15 @@ typedef struct {
   bool narrow_range;
 } TfLiteFakeQuantParams;
 
+typedef struct {
+  int values_count;
+  int axis;
+} TfLitePackParams;
+
+typedef struct {
+  int axis;
+} TfLiteOneHotParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index 558e5471213117a138ab9c5ad4f80d5051b2fb09..0b6568fd2fec583914de1d1594f29912425d8b40 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -109,6 +109,8 @@ typedef enum {
   kTfLiteBuiltinReduceProd = 81,
   kTfLiteBuiltinReduceMax = 82,
   kTfLiteBuiltinPack = 83,
+  kTfLiteBuiltinLogicalOr = 84,
+  kTfLiteBuiltinOneHot = 85,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h
index 1ff8843fa78f48fc74b4d7e7d0cc4ae2a0d255af..5bc20106d31357e2da3f005baee0f8d134d37be2 100644
--- a/tensorflow/contrib/lite/context.h
+++ b/tensorflow/contrib/lite/context.h
@@ -29,6 +29,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
 #define TENSORFLOW_CONTRIB_LITE_CONTEXT_H_
 
+#if defined(_MSC_VER)
+#include <complex.h>
+#endif
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
@@ -180,7 +183,11 @@ typedef union {
   uint8_t* uint8;
   bool* b;
   int16_t* i16;
+#if defined(_MSC_VER)
+  _Fcomplex* c64;
+#else
   _Complex float* c64;
+#endif
 } TfLitePtrUnion;
 
 // Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped
@@ -464,6 +471,12 @@ typedef struct _TfLiteDelegate {
 } TfLiteDelegate;
 
 // WARNING: This is an experimental interface that is subject to change.
+//
+// Currently, TfLiteDelegateParams has to be allocated in a way that it's
+// trivially destructable. It will be stored as `builtin_data` field in
+// `TfLiteNode` of the delegate node.
+//
+// See also the `CreateDelegateParams` function in `interpreter.cc` details.
 typedef struct {
   TfLiteDelegate* delegate;
   TfLiteIntArray* nodes_to_replace;
diff --git a/tensorflow/contrib/lite/delegates/eager/BUILD b/tensorflow/contrib/lite/delegates/eager/BUILD
index 9f31ffdf67e2dc27aeea6e827a852b5868e3126c..a28707382ebaac421a077432a6efd4ea1f6bb0fb 100644
--- a/tensorflow/contrib/lite/delegates/eager/BUILD
+++ b/tensorflow/contrib/lite/delegates/eager/BUILD
@@ -42,10 +42,6 @@ cc_library(
     name = "delegate_data",
     srcs = ["delegate_data.cc"],
     hdrs = ["delegate_data.h"],
-    tags = [
-        "no_oss",
-        "tflite_not_portable",
-    ],
     deps = [
         ":buffer_map",
         "//tensorflow/core:core_cpu",
@@ -59,6 +55,7 @@ cc_test(
     size = "small",
     srcs = ["delegate_data_test.cc"],
     tags = [
+        "no_oss",
         "tflite_not_portable",
     ],
     deps = [
@@ -70,6 +67,43 @@ cc_test(
     ],
 )
 
+cc_library(
+    name = "kernel",
+    srcs = ["kernel.cc"],
+    hdrs = ["kernel.h"],
+    deps = [
+        ":delegate_data",
+        ":util",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:kernel_api",
+        "//tensorflow/contrib/lite/kernels:kernel_util",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/common_runtime/eager:execute",
+        "//tensorflow/core/common_runtime/eager:tensor_handle",
+        "@flatbuffers",
+    ],
+)
+
+cc_test(
+    name = "kernel_test",
+    size = "small",
+    srcs = ["kernel_test.cc"],
+    tags = [
+        "no_oss",
+        "tflite_not_portable",
+    ],
+    deps = [
+        ":delegate_data",
+        ":kernel",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_absl//absl/memory",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
 cc_library(
     name = "util",
     srcs = ["util.cc"],
diff --git a/tensorflow/contrib/lite/delegates/eager/buffer_map.cc b/tensorflow/contrib/lite/delegates/eager/buffer_map.cc
index 1d6453f498a9474697240843ff8aff0d830e6a4f..e5a19c39976969a0b05b28596c6d7d5ebe7c7782 100644
--- a/tensorflow/contrib/lite/delegates/eager/buffer_map.cc
+++ b/tensorflow/contrib/lite/delegates/eager/buffer_map.cc
@@ -91,6 +91,10 @@ void BufferMap::SetFromTfLite(int tensor_index, const TfLiteTensor* tensor) {
   for (int i = 0; i < num_dims; ++i) {
     shape.AddDim(tensor->dims->data[i]);
   }
+  // TODO(ahentz): we assume this is a new tensor and allocate a new buffer
+  // for it. This is not always the best approach. For example, this might
+  // be a reallocation after resizing tensors. In that case we would be
+  // preferable to somehow reuse the buffer.
   auto* buf = new TfLiteTensorBuffer(tensor);
   tensorflow::Tensor t = tensorflow::TensorCApi::MakeTensor(
       GetTensorFlowDataType(tensor->type), shape, buf);
diff --git a/tensorflow/contrib/lite/delegates/eager/delegate_data.cc b/tensorflow/contrib/lite/delegates/eager/delegate_data.cc
index 29687694bd0fba6b496f9b24c630d5929756ed83..0fd5c976f8ca9be16f7e3c5e610573755b40c506 100644
--- a/tensorflow/contrib/lite/delegates/eager/delegate_data.cc
+++ b/tensorflow/contrib/lite/delegates/eager/delegate_data.cc
@@ -23,7 +23,8 @@ tensorflow::Status DelegateData::Create(std::unique_ptr<DelegateData>* data) {
   std::vector<tensorflow::Device*> devices;
 
   TF_RETURN_IF_ERROR(tensorflow::DeviceFactory::AddDevices(
-      tensorflow::SessionOptions(), "/device:cpu:*", &devices));
+      tensorflow::SessionOptions(), "/job:localhost/replica:0/task:0",
+      &devices));
 
   std::unique_ptr<tensorflow::DeviceMgr> device_mgr(
       new tensorflow::DeviceMgr(devices));
diff --git a/tensorflow/contrib/lite/delegates/eager/kernel.cc b/tensorflow/contrib/lite/delegates/eager/kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..172798180762f87e1c080be7788db661a63208b5
--- /dev/null
+++ b/tensorflow/contrib/lite/delegates/eager/kernel.cc
@@ -0,0 +1,289 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/delegates/eager/kernel.h"
+
+#include "third_party/flatbuffers/include/flatbuffers/flexbuffers.h"
+#include "tensorflow/contrib/lite/builtin_ops.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/context_util.h"
+#include "tensorflow/contrib/lite/delegates/eager/delegate_data.h"
+#include "tensorflow/contrib/lite/delegates/eager/util.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/execute.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+
+// Note: this is part of TF Lite's Eager delegation code which is to be
+// completed soon.
+
+// This is the TF Lite op that is created by the eager delegate to handle
+// execution of a supported subgraph. The usual flow is that the delegate
+// informs the interpreter of supported nodes in a graph, and each supported
+// subgraph is replaced with one instance of this kernel.
+//
+// The kernel is initialized with TfLiteDelegateParams from which we retrieve
+// the global EagerContext and BufferMap, as well as a list of inputs and
+// outputs to the subgraph. Those are used to build the OpData, with a list of
+// TensorFlow Ops that should be executed in order (which we call an OpNode).
+//
+// For each node included in the subgraph, we query the interpreter and
+// retrieve the associated NodeDef, which is then used to configure the
+// corresponding TensorFlow/Eager Op.
+
+namespace tflite {
+namespace eager {
+namespace kernel {
+
+// Controls the lifetime of tensor handles in a vector.
+class VectorOfHandles {
+ public:
+  explicit VectorOfHandles(int num_elements) : vector_(num_elements, nullptr) {}
+
+  ~VectorOfHandles() {
+    for (auto* handle : vector_) {
+      if (handle) handle->Unref();
+    }
+  }
+
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2>* GetVector() {
+    return &vector_;
+  }
+
+  tensorflow::TensorHandle* GetHandle(int index) { return vector_[index]; }
+
+ private:
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> vector_;
+};
+
+// Executes the TensorFlow op given by 'op_name', with the attributes specified
+// in 'nodedef'. Inputs and outputs are given as indices into the 'buffer_map'.
+tensorflow::Status ExecuteEagerOp(tensorflow::EagerContext* eager_context,
+                                  BufferMap* buffer_map, const string& op_name,
+                                  const tensorflow::NodeDef& nodedef,
+                                  const std::vector<int>& inputs,
+                                  const std::vector<int>& outputs) {
+  const tensorflow::AttrTypeMap* attr_types;
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(
+      tensorflow::AttrTypeMapForOp(op_name.c_str(), &attr_types),
+      " (while processing attributes of '", op_name, "')");
+
+  tensorflow::EagerOperation op(eager_context, op_name.c_str(), attr_types);
+  for (const auto& attr : nodedef.attr()) {
+    op.MutableAttrs()->Set(attr.first, attr.second);
+  }
+
+  for (int input_index : inputs) {
+    if (!buffer_map->HasTensor(input_index)) {
+      return tensorflow::errors::Internal(
+          "Cannot read from invalid tensor index ", input_index);
+    }
+    auto* handle = new tensorflow::TensorHandle(
+        buffer_map->GetTensor(input_index), nullptr, nullptr, nullptr);
+    op.AddInput(handle);
+    handle->Unref();
+  }
+
+  int num_retvals = outputs.size();
+  VectorOfHandles retvals(num_retvals);
+  TF_RETURN_WITH_CONTEXT_IF_ERROR(
+      EagerExecute(&op, retvals.GetVector(), &num_retvals),
+      " (while executing '", op_name, "' via Eager)");
+
+  if (num_retvals != outputs.size()) {
+    return tensorflow::errors::Internal(
+        "Unexpected number of outputs from EagerExecute");
+  }
+
+  for (int i = 0; i < num_retvals; ++i) {
+    const tensorflow::Tensor* tensor = nullptr;
+    TF_RETURN_IF_ERROR(retvals.GetHandle(i)->Tensor(&tensor));
+    buffer_map->SetFromTensorFlow(outputs[i], *tensor);
+  }
+
+  return tensorflow::Status::OK();
+}
+
+// A single node within the larger 'op'. Note that this kernel executes many
+// TensorFlow ops within a single TF Lite op.
+struct OpNode {
+  // The name of the TensorFlow op to execute.
+  string name;
+  // The corresponding NodeDef, containing the attributes for the op.
+  tensorflow::NodeDef nodedef;
+  // List of inputs, as TF Lite tensor indices.
+  std::vector<int> inputs;
+  // List of outputs, as TF Lite tensor indices.
+  std::vector<int> outputs;
+};
+
+// The Larger 'op', which contains all the nodes in a supported subgraph.
+struct OpData {
+  tensorflow::EagerContext* eager_context;
+  BufferMap* buffer_map;
+  std::vector<OpNode> nodes;
+  std::vector<int> subgraph_inputs;
+  std::vector<int> subgraph_outputs;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* op_data = new OpData;
+
+  const TfLiteDelegateParams* params =
+      reinterpret_cast<const TfLiteDelegateParams*>(buffer);
+  CHECK(params);
+  CHECK(params->delegate);
+  CHECK(params->delegate->data_);
+  op_data->eager_context =
+      reinterpret_cast<DelegateData*>(params->delegate->data_)
+          ->GetEagerContext();
+  op_data->buffer_map =
+      reinterpret_cast<DelegateData*>(params->delegate->data_)->GetBufferMap();
+
+  CHECK(params->output_tensors);
+  for (auto tensor_index : TfLiteIntArrayView(params->output_tensors)) {
+    op_data->subgraph_outputs.push_back(tensor_index);
+  }
+
+  CHECK(params->input_tensors);
+  for (auto tensor_index : TfLiteIntArrayView(params->input_tensors)) {
+    op_data->subgraph_inputs.push_back(tensor_index);
+  }
+
+  CHECK(params->nodes_to_replace);
+  for (auto node_index : TfLiteIntArrayView(params->nodes_to_replace)) {
+    TfLiteNode* node;
+    TfLiteRegistration* reg;
+    context->GetNodeAndRegistration(context, node_index, &node, &reg);
+
+    op_data->nodes.push_back(OpNode());
+    OpNode& node_data = op_data->nodes.back();
+
+    node_data.name = "";
+    if (node->custom_initial_data) {
+      // The flexbuffer contains a vector where the first elements is the
+      // op name and the second is a serialized NodeDef.
+      const flexbuffers::Vector& v =
+          flexbuffers::GetRoot(
+              reinterpret_cast<const uint8_t*>(node->custom_initial_data),
+              node->custom_initial_data_size)
+              .AsVector();
+
+      node_data.name = v[0].AsString().str();
+      if (!node_data.nodedef.ParseFromString(v[1].AsString().str())) {
+        // We will just leave the nodedef empty and error out in Eval().
+        node_data.nodedef.Clear();
+      }
+    }
+
+    for (auto input_index : TfLiteIntArrayView(node->inputs)) {
+      node_data.inputs.push_back(input_index);
+    }
+    for (auto output_index : TfLiteIntArrayView(node->outputs)) {
+      node_data.outputs.push_back(output_index);
+    }
+  }
+
+  return op_data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const auto* op_data = reinterpret_cast<OpData*>(node->user_data);
+  TF_LITE_ENSURE_MSG(
+      context, op_data->eager_context != nullptr,
+      "Failed to initialize eager context. This often happens when a CPU "
+      "device has not been registered, presumably because some symbols from "
+      "tensorflow/core:core_cpu_impl were not linked into the binary.");
+
+  // Whenever we find a constant tensor, insert it in the buffer map.
+  BufferMap* buffer_map = op_data->buffer_map;
+  for (auto tensor_index : op_data->subgraph_inputs) {
+    TfLiteTensor* tensor = &context->tensors[tensor_index];
+    if (IsConstantTensor(tensor)) {
+      if (!buffer_map->HasTensor(tensor_index)) {
+        buffer_map->SetFromTfLite(tensor_index, tensor);
+      }
+    }
+  }
+
+  // All output tensors are allocated by TensorFlow/Eager, so we
+  // mark them as kTfLiteDynamic.
+  for (auto tensor_index : op_data->subgraph_outputs) {
+    SetTensorToDynamic(&context->tensors[tensor_index]);
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const auto* op_data = reinterpret_cast<OpData*>(node->user_data);
+  BufferMap* buffer_map = op_data->buffer_map;
+  tensorflow::EagerContext* eager_context = op_data->eager_context;
+
+  // Insert a tensor in the buffer map for all inputs that are not constant.
+  // Constants were handled in Prepare() already.
+  for (auto tensor_index : op_data->subgraph_inputs) {
+    TfLiteTensor* tensor = &context->tensors[tensor_index];
+    if (!IsConstantTensor(tensor)) {
+      buffer_map->SetFromTfLite(tensor_index, tensor);
+    }
+  }
+
+  // Execute the TensorFlow Ops sequentially.
+  for (const auto& node_data : op_data->nodes) {
+    if (node_data.nodedef.op().empty()) {
+      context->ReportError(context, "Invalid NodeDef in Eager op '%s'",
+                           node_data.name.c_str());
+      return kTfLiteError;
+    }
+    auto status =
+        ExecuteEagerOp(eager_context, buffer_map, node_data.name,
+                       node_data.nodedef, node_data.inputs, node_data.outputs);
+    TF_LITE_ENSURE_OK(context, ConvertStatus(context, status));
+  }
+
+  for (auto tensor_index : op_data->subgraph_outputs) {
+    if (!buffer_map->HasTensor(tensor_index)) {
+      context->ReportError(context, "Cannot write to invalid tensor index %d",
+                           tensor_index);
+      return kTfLiteError;
+    }
+
+    TfLiteTensor* tensor = &context->tensors[tensor_index];
+    TF_LITE_ENSURE_OK(
+        context,
+        CopyShape(context, buffer_map->GetTensor(tensor_index), tensor));
+    tensor->buffer_handle = tensor_index;
+    tensor->data_is_stale = true;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace kernel
+
+TfLiteRegistration GetKernel() {
+  TfLiteRegistration registration{&kernel::Init,    &kernel::Free,
+                                  &kernel::Prepare, &kernel::Eval,
+                                  nullptr,          kTfLiteBuiltinDelegate};
+  return registration;
+}
+
+}  // namespace eager
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/eager/kernel.h b/tensorflow/contrib/lite/delegates/eager/kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..100672c82dcd3eaee17325f3b712140b081e8efe
--- /dev/null
+++ b/tensorflow/contrib/lite/delegates/eager/kernel.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_KERNEL_H_
+#define TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_KERNEL_H_
+
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+namespace eager {
+
+// Return the registration object used to initialize and execute ops that will
+// be delegated to TensorFlow's Eager runtime. This TF Lite op is created by
+// the eager delegate to handle execution of a supported subgraph. The usual
+// flow is that the delegate informs the interpreter of supported nodes in a
+// graph, and each supported subgraph is replaced with one instance of this
+// kernel.
+TfLiteRegistration GetKernel();
+
+}  // namespace eager
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_EAGER_KERNEL_H_
diff --git a/tensorflow/contrib/lite/delegates/eager/kernel_test.cc b/tensorflow/contrib/lite/delegates/eager/kernel_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7d9dddef93346c8e20df0d3f84ece6197a605c86
--- /dev/null
+++ b/tensorflow/contrib/lite/delegates/eager/kernel_test.cc
@@ -0,0 +1,351 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/delegates/eager/kernel.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/memory/memory.h"
+#include "third_party/flatbuffers/include/flatbuffers/flexbuffers.h"
+#include "tensorflow/contrib/lite/delegates/eager/delegate_data.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+namespace eager {
+namespace {
+
+using tensorflow::protobuf::TextFormat;
+using ::testing::ContainsRegex;
+using ::testing::ElementsAre;
+
+// We will use these are custom_names, so they need to be static.
+static const char kIdentity[] = "Identity";
+static const char kUnpack[] = "Unpack";
+static const char kAdd[] = "Add";
+static const char kMul[] = "Mul";
+
+TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteDelegate* delegate,
+                            const std::vector<int>& supported_nodes) {
+  TfLiteIntArray* size_and_nodes =
+      ConvertVectorToTfLiteIntArray(supported_nodes);
+  TF_LITE_ENSURE_STATUS(context->ReplaceSubgraphsWithDelegateKernels(
+      context, eager::GetKernel(), size_and_nodes, delegate));
+  TfLiteIntArrayFree(size_and_nodes);
+  return kTfLiteOk;
+}
+
+class KernelTest : public ::testing::Test {
+ public:
+  KernelTest() {
+    CHECK(DelegateData::Create(&delegate_data_).ok());
+    interpreter_.reset(new Interpreter(&error_reporter_));
+  }
+
+  bool Invoke() { return interpreter_->Invoke() == kTfLiteOk; }
+
+  void SetValues(int tensor_index, const std::vector<float>& values) {
+    float* v = interpreter_->typed_tensor<float>(tensor_index);
+    for (float f : values) {
+      *v++ = f;
+    }
+  }
+
+  std::vector<float> GetValues(int tensor_index) {
+    TfLiteTensor* o = interpreter_->tensor(tensor_index);
+    return std::vector<float>(o->data.f, o->data.f + o->bytes / sizeof(float));
+  }
+
+  void SetShape(int tensor_index, const std::vector<int>& values) {
+    ASSERT_EQ(interpreter_->ResizeInputTensor(tensor_index, values), kTfLiteOk);
+    ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
+  }
+
+  std::vector<int> GetShape(int tensor_index) {
+    std::vector<int> result;
+    auto* dims = interpreter_->tensor(tensor_index)->dims;
+    for (int i = 0; i < dims->size; ++i) {
+      result.push_back(dims->data[i]);
+    }
+    return result;
+  }
+
+  template <typename T>
+  void ConfigureDelegate(T prepare_function) {
+    delegate_.data_ = delegate_data_.get();
+    delegate_.FreeBufferHandle = nullptr;
+    delegate_.Prepare = prepare_function;
+    delegate_.CopyFromBufferHandle = [](TfLiteDelegate* delegate,
+                                        TfLiteBufferHandle buffer_handle,
+                                        void* data, size_t size) {
+      auto* delegate_data = reinterpret_cast<DelegateData*>(delegate->data_);
+      tensorflow::StringPiece values =
+          delegate_data->GetBufferMap()->GetTensor(buffer_handle).tensor_data();
+      memcpy(data, values.data(), values.size());
+      return kTfLiteOk;
+    };
+    CHECK(interpreter_->ModifyGraphWithDelegate(
+              &delegate_, /*allow_dynamic_tensors=*/true) == kTfLiteOk);
+  }
+
+  void AddOp(const char* name, const std::vector<int>& inputs,
+             const std::vector<int>& outputs) {
+    auto attr = [](const string& key, const string& value) {
+      return " attr{ key: '" + key + "' value {" + value + "}}";
+    };
+
+    string attributes;
+    if (name == string(kUnpack)) {
+      attributes = attr("T", "type: DT_FLOAT") + attr("num", "i: 2") +
+                   attr("axis", "i: 0");
+    } else if (name == string(kIdentity)) {
+      attributes = attr("T", "type: DT_FLOAT");
+    } else if (name == string(kAdd)) {
+      attributes = attr("T", "type: DT_FLOAT");
+    } else if (name == string(kMul)) {
+      attributes = attr("T", "type: DT_FLOAT");
+    }
+    AddTfOp(name, attributes, inputs, outputs);
+  }
+
+  void AddTensors(int num_tensors, const std::vector<int>& inputs,
+                  const std::vector<int>& outputs) {
+    interpreter_->AddTensors(num_tensors);
+    for (int i = 0; i < num_tensors; ++i) {
+      TfLiteQuantizationParams quant;
+      CHECK_EQ(interpreter_->SetTensorParametersReadWrite(i, kTfLiteFloat32,
+                                                          /*name=*/"",
+                                                          /*dims=*/{3}, quant),
+               kTfLiteOk);
+    }
+
+    CHECK_EQ(interpreter_->SetInputs(inputs), kTfLiteOk);
+    CHECK_EQ(interpreter_->SetOutputs(outputs), kTfLiteOk);
+  }
+
+  const TestErrorReporter& error_reporter() const { return error_reporter_; }
+
+  void AddTfLiteOp(const char* name, const std::vector<int>& inputs,
+                   const std::vector<int>& outputs) {
+    CHECK_EQ(string(name), kMul);  // can only add MUL
+    static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+    reg.builtin_code = BuiltinOperator_MUL;
+    reg.prepare = [](TfLiteContext* context, TfLiteNode* node) {
+      auto* i0 = &context->tensors[node->inputs->data[0]];
+      auto* o = &context->tensors[node->outputs->data[0]];
+      return context->ResizeTensor(context, o, TfLiteIntArrayCopy(i0->dims));
+    };
+    reg.invoke = [](TfLiteContext* context, TfLiteNode* node) {
+      auto* i0 = &context->tensors[node->inputs->data[0]];
+      auto* i1 = &context->tensors[node->inputs->data[1]];
+      auto* o = &context->tensors[node->outputs->data[0]];
+      for (int i = 0; i < o->bytes / sizeof(float); ++i) {
+        o->data.f[i] = i0->data.f[i] * i1->data.f[i];
+      }
+      return kTfLiteOk;
+    };
+
+    CHECK_EQ(interpreter_->AddNodeWithParameters(inputs, outputs, nullptr, 0,
+                                                 nullptr, &reg),
+             kTfLiteOk);
+  }
+
+ private:
+  void AddTfOp(const char* name, const string& nodedef_str,
+               const std::vector<int>& inputs,
+               const std::vector<int>& outputs) {
+    static TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr};
+    reg.builtin_code = BuiltinOperator_CUSTOM;
+    reg.custom_name = name;
+
+    tensorflow::NodeDef nodedef;
+    CHECK(TextFormat::ParseFromString(nodedef_str + " op: '" + name + "'",
+                                      &nodedef));
+    string serialized_nodedef;
+    CHECK(nodedef.SerializeToString(&serialized_nodedef));
+    flexbuffers::Builder fbb;
+    fbb.Vector([&]() {
+      fbb.String(nodedef.op());
+      fbb.String(serialized_nodedef);
+    });
+    fbb.Finish();
+
+    flexbuffers_.push_back(fbb.GetBuffer());
+    auto& buffer = flexbuffers_.back();
+    CHECK_EQ(interpreter_->AddNodeWithParameters(
+                 inputs, outputs, reinterpret_cast<const char*>(buffer.data()),
+                 buffer.size(), nullptr, &reg),
+             kTfLiteOk);
+  }
+
+  std::unique_ptr<Interpreter> interpreter_;
+  std::unique_ptr<DelegateData> delegate_data_;
+  TfLiteDelegate delegate_;
+  std::vector<std::vector<uint8_t>> flexbuffers_;
+  TestErrorReporter error_reporter_;
+};
+
+TEST_F(KernelTest, FullGraph) {
+  // Define the graph.
+  AddTensors(9, {0, 3}, {8});
+
+  AddOp(kUnpack, {0}, {1, 2});
+  AddOp(kUnpack, {3}, {4, 5});
+  AddOp(kAdd, {1, 4}, {6});
+  AddOp(kAdd, {2, 5}, {7});
+  AddOp(kMul, {6, 7}, {8});
+
+  // Apply Delegate.
+  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+    return GenericPrepare(context, delegate, {0, 1, 2, 3, 4});
+  });
+
+  // Define inputs.
+  SetShape(0, {2, 2, 1});
+  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+  SetShape(3, {2, 2, 1});
+  SetValues(3, {1.1f, 2.2f, 3.3f, 4.4f});
+
+  ASSERT_TRUE(Invoke());
+
+  ASSERT_THAT(GetShape(8), ElementsAre(2, 1));
+  ASSERT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f));
+}
+
+TEST_F(KernelTest, BadTensorFlowOp) {
+  AddTensors(2, {0}, {1});
+  AddOp("NonExistentOp", {0}, {1});
+
+  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+    return GenericPrepare(context, delegate, {0});
+  });
+
+  SetShape(0, {2, 2, 1});
+  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+
+  ASSERT_FALSE(Invoke());
+  ASSERT_THAT(error_reporter().error_messages(),
+              ContainsRegex("while processing attributes of 'NonExistentOp'"));
+}
+
+TEST_F(KernelTest, BadNumberOfOutputs) {
+  AddTensors(3, {0}, {1, 2});
+  AddOp(kIdentity, {0}, {1, 2});
+
+  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+    return GenericPrepare(context, delegate, {0});
+  });
+
+  SetShape(0, {2, 2, 1});
+  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+
+  ASSERT_FALSE(Invoke());
+  ASSERT_THAT(error_reporter().error_messages(),
+              ContainsRegex("Unexpected number of outputs"));
+}
+
+TEST_F(KernelTest, IncompatibleNodeDef) {
+  AddTensors(2, {0}, {1});
+
+  // Cast is a TF op, but we don't add the proper nodedef to it in AddOp.
+  AddOp("Cast", {0}, {1});
+
+  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+    return GenericPrepare(context, delegate, {0});
+  });
+
+  SetShape(0, {2, 2, 1});
+  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+
+  ASSERT_FALSE(Invoke());
+  ASSERT_THAT(error_reporter().error_messages(),
+              ContainsRegex("while executing 'Cast' via Eager"));
+}
+
+TEST_F(KernelTest, WrongSetOfNodes) {
+  AddTensors(4, {0}, {3});
+  AddOp(kUnpack, {0}, {1, 2});
+  AddTfLiteOp(kMul, {1, 2}, {3});
+
+  // Specify that kMul (#1) is supported when it actually isn't.
+  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+    return GenericPrepare(context, delegate, {0, 1});
+  });
+
+  SetShape(0, {2, 2, 1});
+  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+
+  ASSERT_FALSE(Invoke());
+  ASSERT_THAT(error_reporter().error_messages(),
+              ContainsRegex("Invalid NodeDef in Eager op"));
+}
+
+TEST_F(KernelTest, MixedGraph) {
+  AddTensors(9, {0, 3}, {8});
+
+  AddOp(kUnpack, {0}, {1, 2});
+  AddOp(kUnpack, {3}, {4, 5});
+  AddOp(kAdd, {1, 4}, {6});
+  AddOp(kAdd, {2, 5}, {7});
+  AddTfLiteOp(kMul, {6, 7}, {8});
+
+  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+    return GenericPrepare(context, delegate, {0, 1, 2, 3});
+  });
+
+  SetShape(0, {2, 2, 1});
+  SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f});
+  SetShape(3, {2, 2, 1});
+  SetValues(3, {1.1f, 2.2f, 3.3f, 4.4f});
+
+  ASSERT_TRUE(Invoke());
+
+  ASSERT_THAT(GetShape(8), ElementsAre(2, 1));
+  ASSERT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f));
+}
+
+TEST_F(KernelTest, SplitGraph) {
+  AddTensors(10, {0}, {9});
+
+  AddOp(kUnpack, {0}, {1, 2});
+  AddOp(kAdd, {1, 2}, {3});
+  AddOp(kUnpack, {3}, {4, 5});
+
+  AddTfLiteOp(kMul, {4, 5}, {6});
+
+  AddOp(kUnpack, {6}, {7, 8});
+  AddOp(kAdd, {7, 8}, {9});
+
+  ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) {
+    return GenericPrepare(context, delegate, {0, 1, 2, 4, 5});
+  });
+
+  SetShape(0, {2, 2, 2, 1});
+  SetValues(0, {3.0f, 1.0f, 0.5f, -1.0f, 0.0f, 1.0f, 1.5f, 3.0f});
+
+  ASSERT_TRUE(Invoke());
+
+  ASSERT_THAT(GetShape(9), ElementsAre(1));
+  ASSERT_THAT(GetValues(9), ElementsAre(10.0f));
+}
+
+}  // namespace
+}  // namespace eager
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
index f0d16575ec5a0f9db799b8be44907e8a999a348c..60855eb8edc4fb708d76b1e3a4ac37d462a64465 100644
--- a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
@@ -436,7 +436,6 @@ class NNAPIDelegateKernel {
         }
         break;
       case kTfLiteBuiltinSqueeze:
-        // Squeeze requires NNAPI1.1.
         if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
           return [](TfLiteContext* context, NNAPIOpBuilder* builder,
                     TfLiteNode* node) -> ANeuralNetworksOperationType {
@@ -452,9 +451,227 @@ class NNAPIDelegateKernel {
         } else {
           return nullptr;
         }
+      case kTfLiteBuiltinL2Normalization: {
+        auto builtin =
+            reinterpret_cast<TfLiteL2NormParams*>(node->builtin_data);
+        if (builtin->activation != kTfLiteActNone) {
+          // NNAPI does not support activations
+          return nullptr;
+        }
+        return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                  TfLiteNode* node) -> ANeuralNetworksOperationType {
+          return ANEURALNETWORKS_L2_NORMALIZATION;
+        };
+      }
+      case kTfLiteBuiltinLocalResponseNormalization:
+        if (version == 1) {
+          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                    TfLiteNode* node) -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteLocalResponseNormParams*>(
+                node->builtin_data);
+            builder->AddScalarInt32Operand(builtin->radius);
+            builder->AddScalarFloat32Operand(builtin->bias);
+            builder->AddScalarFloat32Operand(builtin->alpha);
+            builder->AddScalarFloat32Operand(builtin->beta);
+            return ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION;
+          };
+        } else {
+          // TODO(miaowang): clean-up code and return early in the unsupported
+          // case.
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinLshProjection:
+        if (version == 1) {
+          // NNAPI does not support sparse projection correctly (b/111751836).
+          if (reinterpret_cast<TfLiteLSHProjectionParams*>(node->builtin_data)
+                  ->type == kTfLiteLshProjectionSparse) {
+            return nullptr;
+          }
+          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                    TfLiteNode* node) -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteLSHProjectionParams*>(
+                node->builtin_data);
+            builder->AddScalarInt32Operand(builtin->type);
+            return ANEURALNETWORKS_LSH_PROJECTION;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinConcatenation:
+        if (version == 1 &&
+            reinterpret_cast<TfLiteConcatenationParams*>(node->builtin_data)
+                    ->activation == kTfLiteActNone) {
+          if (context->tensors[node->inputs->data[0]].type == kTfLiteUInt8) {
+            // NNAPI only support concatenating quantized tensor of the same
+            // scale and offset.
+            auto first_param = context->tensors[node->inputs->data[0]].params;
+            for (int i = 0; i < node->inputs->size; i++) {
+              auto curr_param = context->tensors[node->inputs->data[i]].params;
+              if (curr_param.scale != first_param.scale ||
+                  curr_param.zero_point != first_param.zero_point) {
+                return nullptr;
+              }
+            }
+          }
+          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                    TfLiteNode* node) -> ANeuralNetworksOperationType {
+            auto builtin = reinterpret_cast<TfLiteConcatenationParams*>(
+                node->builtin_data);
+            builder->AddScalarInt32Operand(builtin->axis);
+            return ANEURALNETWORKS_CONCATENATION;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinDequantize:
+        if (version == 1) {
+          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                    TfLiteNode* node) -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_DEQUANTIZE;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinFloor:
+        if (version == 1) {
+          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                    TfLiteNode* node) -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_FLOOR;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinRelu:
+        if (version == 1) {
+          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                    TfLiteNode* node) -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_RELU;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinReluN1To1:
+        if (version == 1) {
+          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                    TfLiteNode* node) -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_RELU1;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinRelu6:
+        if (version == 1) {
+          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                    TfLiteNode* node) -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_RELU6;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinLogistic:
+        if (version == 1) {
+          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                    TfLiteNode* node) -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_LOGISTIC;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinTanh:
+        // TODO(miaowang): add additional checks for the parameters.
+        if (version == 1 &&
+            context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
+          // NNAPI only support float tanh.
+          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                    TfLiteNode* node) -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_TANH;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinSub:
+        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+            context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
+          // NNAPI only support float sub.
+          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                    TfLiteNode* node) -> ANeuralNetworksOperationType {
+            auto builtin =
+                reinterpret_cast<TfLiteSubParams*>(node->builtin_data);
+            builder->AddScalarInt32Operand(builtin->activation);
+            return ANEURALNETWORKS_SUB;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinDiv:
+        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+            context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
+          // NNAPI only support float div.
+          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                    TfLiteNode* node) -> ANeuralNetworksOperationType {
+            auto builtin =
+                reinterpret_cast<TfLiteDivParams*>(node->builtin_data);
+            builder->AddScalarInt32Operand(builtin->activation);
+            return ANEURALNETWORKS_DIV;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinPad:
+        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11 &&
+            node->inputs->size == 2 &&
+            context->tensors[node->inputs->data[0]].type == kTfLiteFloat32) {
+          // NNAPI does not support specifying the padding value.
+          // NNAPI pads physical zero for quantized tensors, so only delegate
+          // float pad to NNAPI.
+          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                    TfLiteNode* node) -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_PAD;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinSpaceToBatchNd:
+        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                    TfLiteNode* node) -> ANeuralNetworksOperationType {
+            return ANEURALNETWORKS_SPACE_TO_BATCH_ND;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
+      case kTfLiteBuiltinStridedSlice:
+        if (version == 1 && kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) {
+          return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                    TfLiteNode* node) -> ANeuralNetworksOperationType {
+            auto builtin =
+                reinterpret_cast<TfLiteStridedSliceParams*>(node->builtin_data);
+            builder->AddScalarInt32Operand(builtin->begin_mask);
+            builder->AddScalarInt32Operand(builtin->end_mask);
+            builder->AddScalarInt32Operand(builtin->shrink_axis_mask);
+            return ANEURALNETWORKS_STRIDED_SLICE;
+          };
+        } else {
+          return nullptr;
+        }
+        break;
       case kTfLiteBuiltinTranspose:
-        // Transpose requires NNAPI1.1. Also note that the permutation input
-        // tensor value dictates the output dimensions.
+        // Note that the permutation input tensor value dictates the output
+        // dimensions.
         // TODO(b/110888333): Support dynamically-sized tensors in delegates.
         if ((version == 1) &&
             (kAndroidSdkVersion >= kMinSdkVersionForNNAPI11) &&
diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc
index ab2181e8ff7b5b4e73dc0264a6e93591d2ee225a..b7b159c59f2f81b055d5d06436b70331cff3dea8 100644
--- a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc
+++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc
@@ -82,7 +82,7 @@ TEST(NNAPIDelegate, AddWithRelu) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({0.0, 0.4, 1.0, 1.3}));
 }
 
-class FloatMulOpModel : public SingleOpModel {
+class FloatMulOpModel : public SingleOpModelWithNNAPI {
  public:
   FloatMulOpModel(const TensorData& input1, const TensorData& input2,
                   const TensorData& output,
@@ -184,7 +184,7 @@ TEST(NNAPIDelegate, L2PoolWithNoActivation) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({3.5, 6.5}));
 }
 
-class BaseConvolutionOpModel : public SingleOpModel {
+class BaseConvolutionOpModel : public SingleOpModelWithNNAPI {
  public:
   BaseConvolutionOpModel(
       const TensorData& input, const TensorData& filter,
@@ -641,6 +641,41 @@ TEST(NNAPIDelegate, SqueezeWithAxisTest) {
                         17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}));
 }
 
+class L2NormOpModel : public SingleOpModelWithNNAPI {
+ public:
+  L2NormOpModel(const TensorData& input, const TensorData& output,
+                ActivationFunctionType activation_type) {
+    input_ = AddInput(input);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_L2_NORMALIZATION, BuiltinOptions_L2NormOptions,
+                 CreateL2NormOptions(builder_, activation_type).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor<float>(input_, data);
+  }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int new_shape_;
+  int output_;
+};
+
+TEST(NNAPIDelegate, L2NormSimpleTest) {
+  std::initializer_list<float> data = {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1};
+  L2NormOpModel m({TensorType_FLOAT32, {1, 1, 1, 6}},
+                  {TensorType_FLOAT32, {1, 1, 1, 6}},
+                  ActivationFunctionType_NONE);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 1, 6}));
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05}));
+}
+
 class TransposeSimpleModel : public SingleOpModelWithNNAPI {
  public:
   TransposeSimpleModel(std::initializer_list<int> input_shape,
@@ -678,6 +713,916 @@ TEST(NNAPIDelegate, TransposeSimpleTest) {
                                 2, 6, 10, 14, 18, 22, 3, 7, 11, 15, 19, 23}));
 }
 
+class FloatSubOpModel : public SingleOpModelWithNNAPI {
+ public:
+  FloatSubOpModel(const TensorData& input1, const TensorData& input2,
+                  const TensorData& output,
+                  ActivationFunctionType activation_type) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_SUB, BuiltinOptions_SubOptions,
+                 CreateMulOptions(builder_, activation_type).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+TEST(NNAPIDelegate, SubWithNoActivation) {
+  FloatSubOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({-2.1, 0.0, 0.4, 0.3})));
+}
+
+class FloatDivOpModel : public SingleOpModelWithNNAPI {
+ public:
+  FloatDivOpModel(const TensorData& input1, const TensorData& input2,
+                  const TensorData& output,
+                  ActivationFunctionType activation_type) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_DIV, BuiltinOptions_DivOptions,
+                 CreateMulOptions(builder_, activation_type).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+TEST(NNAPIDelegate, DivWithNoActivation) {
+  FloatDivOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.8, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.4, 0.2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({-20, 1, 2, 4})));
+}
+
+class BaseConcatenationOpModel : public SingleOpModelWithNNAPI {
+ public:
+  BaseConcatenationOpModel() {}
+  BaseConcatenationOpModel(const TensorData& input_template, int axis,
+                           int num_inputs) {
+    std::vector<std::vector<int>> all_input_shapes;
+    for (int i = 0; i < num_inputs; ++i) {
+      all_input_shapes.push_back(input_template.shape);
+      AddInput(input_template);
+    }
+    output_ = AddOutput({input_template.type, /*shape=*/{}, input_template.min,
+                         input_template.max});
+    SetBuiltinOp(
+        BuiltinOperator_CONCATENATION, BuiltinOptions_ConcatenationOptions,
+        CreateConcatenationOptions(builder_, axis, ActivationFunctionType_NONE)
+            .Union());
+    BuildInterpreter(all_input_shapes);
+  }
+
+ protected:
+  int output_;
+};
+
+class ConcatenationOpModel : public BaseConcatenationOpModel {
+ public:
+  using BaseConcatenationOpModel::BaseConcatenationOpModel;
+  void SetInput(int index, std::initializer_list<float> data) {
+    PopulateTensor(index, data);
+  }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+TEST(NNAPIDelegate, ConcatenationThreeDimensionalOneInput) {
+  ConcatenationOpModel m0({TensorType_FLOAT32, {2, 1, 2}}, /*axis=*/1,
+                          /*num_inputs=*/1);
+  m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({1, 3, 4, 7}));
+}
+
+TEST(NNAPIDelegate, ConcatenationFourInputs) {
+  ConcatenationOpModel m0({TensorType_FLOAT32, {2, 1, 2}}, /*axis=*/2,
+                          /*num_inputs=*/4);
+  m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+  m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput(2, {1.2f, 3.2f, 4.2f, 7.2f});
+  m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetOutput(),
+              ElementsAreArray({
+                  1.0f, 3.0f, 1.1f, 3.1f, 1.2f, 3.2f, 1.3f, 3.3f,  //
+                  4.0f, 7.0f, 4.1f, 7.1f, 4.2f, 7.2f, 4.3f, 7.3f,  //
+              }));
+}
+
+class QuantizedConcatenationOpModel : public BaseConcatenationOpModel {
+ public:
+  using BaseConcatenationOpModel::BaseConcatenationOpModel;
+  QuantizedConcatenationOpModel(const std::vector<TensorData>& input_template,
+                                int axis, int num_inputs,
+                                const TensorData& output_template) {
+    std::vector<std::vector<int>> all_input_shapes;
+    CHECK_EQ(input_template.size(), num_inputs);
+    for (int i = 0; i < num_inputs; ++i) {
+      all_input_shapes.push_back(input_template[i].shape);
+      AddInput(input_template[i]);
+    }
+    output_ = AddOutput({output_template.type, /*shape=*/{},
+                         output_template.min, output_template.max});
+    SetBuiltinOp(
+        BuiltinOperator_CONCATENATION, BuiltinOptions_ConcatenationOptions,
+        CreateConcatenationOptions(builder_, axis, ActivationFunctionType_NONE)
+            .Union());
+    BuildInterpreter(all_input_shapes);
+  }
+  void SetInput(int index, std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(index, data);
+  }
+  std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+};
+
+TEST(NNAPIDelegate, ConcatenationFourInputsQuantized) {
+  QuantizedConcatenationOpModel m0({TensorType_UINT8, {2, 1, 2}, -12.7, 12.8},
+                                   /*axis=*/2,
+                                   /*num_inputs=*/4);
+
+  m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+  m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput(2, {1.2f, 3.2f, 4.2f, 7.2f});
+  m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({
+                  1.0f, 3.0f, 1.1f, 3.1f, 1.2f, 3.2f, 1.3f, 3.3f,  //
+                  4.0f, 7.0f, 4.1f, 7.1f, 4.2f, 7.2f, 4.3f, 7.3f,  //
+              })));
+  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
+                                  137, 157, 138, 158, 139, 159, 140, 160,  //
+                                  167, 197, 168, 198, 169, 199, 170, 200,  //
+                              }));
+}
+
+TEST(NNAPIDelegate, ConcatenationFourInputsQuantizedMixedRange) {
+  QuantizedConcatenationOpModel m0({{TensorType_UINT8, {2, 1, 2}, -10.7, 10.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 12.8},
+                                    {TensorType_UINT8, {2, 1, 2}, -11, 11.8},
+                                    {TensorType_UINT8, {2, 1, 2}, 0, 7.4}},
+                                   /*axis=*/2, /*num_inputs=*/4,
+                                   {TensorType_UINT8, {2, 1, 2}, -12.7, 12.8});
+
+  m0.SetInput(0, {1.0f, 3.0f, 4.0f, 7.0f});
+  m0.SetInput(1, {1.1f, 3.1f, 4.1f, 7.1f});
+  m0.SetInput(2, {1.2f, 3.2f, 4.2f, 7.2f});
+  m0.SetInput(3, {1.3f, 3.3f, 4.3f, 7.3f});
+  m0.Invoke();
+  EXPECT_THAT(m0.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({
+                  1.0f, 3.0f, 1.1f, 3.1f, 1.2f, 3.2f, 1.3f, 3.3f,  //
+                  4.0f, 7.0f, 4.1f, 7.1f, 4.2f, 7.2f, 4.3f, 7.3f,  //
+              })));
+  EXPECT_THAT(m0.GetOutput(), ElementsAreArray({
+                                  137, 157, 138, 158, 139, 159, 140, 160,  //
+                                  167, 197, 168, 198, 169, 199, 170, 200,  //
+                              }));
+}
+
+class DequantizeOpModel : public SingleOpModelWithNNAPI {
+ public:
+  DequantizeOpModel(std::initializer_list<int> shape, float min, float max) {
+    input_ = AddInput({TensorType_UINT8, shape, min, max});
+    output_ = AddOutput({TensorType_FLOAT32, shape});
+    SetBuiltinOp(BuiltinOperator_DEQUANTIZE, BuiltinOptions_DequantizeOptions,
+                 CreateDequantizeOptions(builder_).Union());
+
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  void SetInput(std::initializer_list<uint8_t> data) {
+    PopulateTensor(input_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(NNAPIDelegate, DequantizeFourDimensional) {
+  DequantizeOpModel m({2, 5}, -63.5, 64);
+
+  m.SetInput({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64})));
+}
+
+class FloorOpModel : public SingleOpModelWithNNAPI {
+ public:
+  FloorOpModel(std::initializer_list<int> input_shape, TensorType input_type) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_FLOOR, BuiltinOptions_NONE, 0);
+    BuildInterpreter({
+        input_shape,
+    });
+  }
+
+  int input() { return input_; }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(NNAPIDelegate, FloorSingleDim) {
+  FloorOpModel model({2}, TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.input(), {8.5, 0.0});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({8, 0}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2}));
+}
+
+TEST(NNAPIDelegate, FloorMultiDims) {
+  FloorOpModel model({2, 1, 1, 5}, TensorType_FLOAT32);
+  model.PopulateTensor<float>(model.input(), {
+                                                 0.0001,
+                                                 8.0001,
+                                                 0.9999,
+                                                 9.9999,
+                                                 0.5,
+                                                 -0.0001,
+                                                 -8.0001,
+                                                 -0.9999,
+                                                 -9.9999,
+                                                 -0.5,
+                                             });
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({0, 8, 0, 9, 0, -1, -9, -1, -10, -1}));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 1, 1, 5}));
+}
+
+class LocalResponseNormOpModel : public SingleOpModelWithNNAPI {
+ public:
+  LocalResponseNormOpModel(std::initializer_list<int> input_shape, int radius,
+                           float bias, float alpha, float beta) {
+    input_ = AddInput(TensorType_FLOAT32);
+    output_ = AddOutput(TensorType_FLOAT32);
+    SetBuiltinOp(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
+                 BuiltinOptions_LocalResponseNormalizationOptions,
+                 CreateLocalResponseNormalizationOptions(builder_, radius, bias,
+                                                         alpha, beta)
+                     .Union());
+    BuildInterpreter({input_shape});
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ private:
+  int input_;
+  int output_;
+};
+
+TEST(NNAPIDelegate, LocalResponseNormSameAsL2Norm) {
+  LocalResponseNormOpModel m({1, 1, 1, 6}, /*radius=*/20, /*bias=*/0.0,
+                             /*alpha=*/1.0, /*beta=*/0.5);
+  m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  m.Invoke();
+  // The result is every input divided by 2.
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray(ArrayFloatNear({-0.55, 0.3, 0.35, 0.6, -0.35, 0.05})));
+}
+
+TEST(NNAPIDelegate, LocalResponseNormWithAlpha) {
+  LocalResponseNormOpModel m({1, 1, 1, 6}, /*radius=*/20, /*bias=*/0.0,
+                             /*alpha=*/4.0, /*beta=*/0.5);
+  m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  m.Invoke();
+  // The result is every input divided by 3.
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear(
+                                 {-0.275, 0.15, 0.175, 0.3, -0.175, 0.025})));
+}
+
+TEST(NNAPIDelegate, LocalResponseNormWithBias) {
+  LocalResponseNormOpModel m({1, 1, 1, 6}, /*radius=*/20, /*bias=*/9.0,
+                             /*alpha=*/4.0, /*beta=*/0.5);
+  m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  m.Invoke();
+  // The result is every input divided by 5.
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray(ArrayFloatNear({-0.22, 0.12, 0.14, 0.24, -0.14, 0.02})));
+}
+
+TEST(NNAPIDelegate, LocalResponseNormSmallRadius) {
+  LocalResponseNormOpModel m({1, 1, 1, 6}, /*radius=*/2, /*bias=*/9.0,
+                             /*alpha=*/4.0, /*beta=*/0.5);
+  m.SetInput({-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray(ArrayFloatNear(
+          {-0.264926, 0.125109, 0.140112, 0.267261, -0.161788, 0.0244266})));
+}
+
+class LSHProjectionOpModel : public SingleOpModelWithNNAPI {
+ public:
+  LSHProjectionOpModel(LSHProjectionType type,
+                       std::initializer_list<int> hash_shape,
+                       std::initializer_list<int> input_shape,
+                       std::initializer_list<int> weight_shape) {
+    hash_ = AddInput(TensorType_FLOAT32);
+    input_ = AddInput(TensorType_INT32);
+    if (weight_shape.size() > 0) {
+      weight_ = AddInput(TensorType_FLOAT32);
+    }
+    output_ = AddOutput(TensorType_INT32);
+
+    SetBuiltinOp(BuiltinOperator_LSH_PROJECTION,
+                 BuiltinOptions_LSHProjectionOptions,
+                 CreateLSHProjectionOptions(builder_, type).Union());
+    if (weight_shape.size() > 0) {
+      BuildInterpreter({hash_shape, input_shape, weight_shape});
+    } else {
+      BuildInterpreter({hash_shape, input_shape});
+    }
+
+    output_size_ = 1;
+    for (int i : hash_shape) {
+      output_size_ *= i;
+      if (type == LSHProjectionType_SPARSE) {
+        break;
+      }
+    }
+  }
+  void SetInput(std::initializer_list<int> data) {
+    PopulateTensor(input_, data);
+  }
+
+  void SetHash(std::initializer_list<float> data) {
+    PopulateTensor(hash_, data);
+  }
+
+  void SetWeight(std::initializer_list<float> f) { PopulateTensor(weight_, f); }
+
+  std::vector<int> GetOutput() { return ExtractVector<int>(output_); }
+
+ private:
+  int input_;
+  int hash_;
+  int weight_;
+  int output_;
+
+  int output_size_;
+};
+
+TEST(NNAPIDelegate, LSHProjectionDense1DInputs) {
+  LSHProjectionOpModel m(LSHProjectionType_DENSE, {3, 2}, {5}, {5});
+
+  m.SetInput({12345, 54321, 67890, 9876, -12345678});
+  m.SetHash({0.123, 0.456, -0.321, 1.234, 5.678, -4.321});
+  m.SetWeight({1.0, 1.0, 1.0, 1.0, 1.0});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAre(0, 0, 0, 1, 0, 0));
+}
+
+TEST(NNAPIDelegate, LSHProjectionSparse1DInputs) {
+  LSHProjectionOpModel m(LSHProjectionType_SPARSE, {3, 2}, {5}, {});
+
+  m.SetInput({12345, 54321, 67890, 9876, -12345678});
+  m.SetHash({0.123, 0.456, -0.321, 1.234, 5.678, -4.321});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAre(0 + 0, 4 + 1, 8 + 0));
+}
+
+TEST(NNAPIDelegate, LSHProjectionSparse3DInputs) {
+  LSHProjectionOpModel m(LSHProjectionType_SPARSE, {3, 2}, {5, 2, 2}, {5});
+
+  m.SetInput({1234, 2345, 3456, 1234, 4567, 5678, 6789, 4567, 7891, 8912,
+              9123, 7890, -987, -876, -765, -987, -543, -432, -321, -543});
+  m.SetHash({0.123, 0.456, -0.321, 1.234, 5.678, -4.321});
+  m.SetWeight({0.12, 0.34, 0.56, 0.67, 0.78});
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAre(0 + 2, 4 + 1, 8 + 1));
+}
+
+class BaseActivationsOpModel : public SingleOpModelWithNNAPI {
+ public:
+  // Most activations don't take any options, so this constructor works for
+  // them.
+  BaseActivationsOpModel(BuiltinOperator type, TensorData input) {
+    input_ = AddInput(input);
+    if (input.type == TensorType_UINT8) {
+      output_ = AddOutput({input.type, {}, 0, 0, 1. / 256});
+    } else {
+      output_ = AddOutput({input.type, {}});
+    }
+    SetBuiltinOp(type, BuiltinOptions_NONE, 0);
+    BuildInterpreter({GetShape(input_)});
+  }
+
+  BaseActivationsOpModel(BuiltinOperator type, const TensorData& input,
+                         const TensorData& output) {
+    input_ = AddInput(input);
+    output_ = AddOutput(output);
+    SetBuiltinOp(type, BuiltinOptions_NONE, 0);
+    BuildInterpreter({GetShape(input_)});
+  }
+
+ protected:
+  int input_;
+  int output_;
+};
+
+class FloatActivationsOpModel : public BaseActivationsOpModel {
+ public:
+  using BaseActivationsOpModel::BaseActivationsOpModel;
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+const float kQuantizedTolerance = 2 * (1. / 256);
+
+class QuantizedActivationsOpModel : public BaseActivationsOpModel {
+ public:
+  using BaseActivationsOpModel::BaseActivationsOpModel;
+
+  template <typename T>
+  void SetInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<T>(input_, data);
+  }
+  template <typename T>
+
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+  template <typename T>
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_),
+                         GetZeroPoint(output_));
+  }
+};
+
+TEST(NNAPIDelegate, Relu) {
+  FloatActivationsOpModel m(BuiltinOperator_RELU,
+                            /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+  m.SetInput({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0, 0, 2, 4,   //
+                                 3, 0, 10, 1,  //
+                             }));
+}
+
+TEST(NNAPIDelegate, Relu1) {
+  FloatActivationsOpModel m(BuiltinOperator_RELU_N1_TO_1,
+                            /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+  m.SetInput({
+      0.0, -0.6, 0.2, -0.4,  //
+      0.3, -2.0, 1.1, -0.1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0.0, -0.6, 0.2, -0.4,  //
+                                 0.3, -1.0, 1.0, -0.1,  //
+                             }));
+}
+
+TEST(NNAPIDelegate, Relu6) {
+  FloatActivationsOpModel m(BuiltinOperator_RELU6,
+                            /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+  m.SetInput({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0, 0, 2, 4,  //
+                                 3, 0, 6, 1,  //
+                             }));
+}
+
+TEST(NNAPIDelegate, Tanh) {
+  FloatActivationsOpModel m(BuiltinOperator_TANH,
+                            /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+  m.SetInput({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                 0, -0.9999877, 0.9640275, 0.999329,    //
+                                 0.99505475, -0.9640275, 1, 0.7615941,  //
+                             })));
+}
+
+TEST(NNAPIDelegate, LogisticFloat) {
+  FloatActivationsOpModel m(BuiltinOperator_LOGISTIC,
+                            /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}});
+  m.SetInput({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
+                                 0.5, 0.002473, 0.880797, 0.982014,       //
+                                 0.952574, 0.119203, 0.999955, 0.731059,  //
+                             })));
+}
+
+TEST(NNAPIDelegate, LogisticQuantized) {
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_LOGISTIC,
+      /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, -10, 10});
+  m.SetInput<uint8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.5, 0.002473, 0.880797, 0.982014,       //
+                      0.952574, 0.119203, 0.999955, 0.731059,  //
+                  },
+                  kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<uint8_t>(),
+              ElementsAreArray({128, 1, 227, 251, 244, 32, 255, 188}));
+}
+
+#if 0
+class ResizeBilinearOpModel : public SingleOpModelWithNNAPI {
+ public:
+  ResizeBilinearOpModel(const TensorData& input,
+                        std::initializer_list<int> size_data = {}) {
+    bool const_size = size_data.size() != 0;
+    input_ = AddInput(input);
+    if (const_size) {
+      size_ = AddConstInput(TensorType_INT32, size_data, {2});
+    } else {
+      size_ = AddInput({TensorType_INT32, {2}});
+    }
+    output_ = AddOutput(input.type);
+    SetBuiltinOp(BuiltinOperator_RESIZE_BILINEAR,
+                 BuiltinOptions_ResizeBilinearOptions,
+                 CreateResizeBilinearOptions(builder_).Union());
+    if (const_size) {
+      BuildInterpreter({GetShape(input_)});
+    } else {
+      BuildInterpreter({GetShape(input_), GetShape(size_)});
+    }
+  }
+
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor(input_, data);
+  }
+  void SetSize(std::initializer_list<int> data) { PopulateTensor(size_, data); }
+
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+
+ private:
+  int input_;
+  int size_;
+  int output_;
+};
+
+TEST(NNAPIDelegate, ResizeBilinearHorizontal) {
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 1, 2, 1}});
+  m.SetInput<float>({3, 6});
+  m.SetSize({1, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 1, 2, 1}}, {1, 3});
+  const_m.SetInput<float>({3, 6});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+}
+
+TEST(NNAPIDelegate, ResizeBilinearVertical) {
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}});
+  m.SetInput<float>({3, 9});
+  m.SetSize({3, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 1, 1}}, {3, 1});
+  const_m.SetInput<float>({3, 9});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+}
+
+TEST(NNAPIDelegate, ResizeBilinearTwoDimensional) {
+  ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}});
+  m.SetInput<float>({
+      3, 6,  //
+      9, 12  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 5, 6,    //
+                                        7, 9, 10,   //
+                                        9, 11, 12,  //
+                                    })));
+
+  ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<float>({
+      3, 6,  //
+      9, 12  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 5, 6,    //
+                                              7, 9, 10,   //
+                                              9, 11, 12,  //
+                                          })));
+}
+#endif
+
+template <typename T>
+class PadOpModel : public SingleOpModelWithNNAPI {
+ public:
+  void SetInput(std::initializer_list<T> data) {
+    PopulateTensor<T>(input_, data);
+  }
+
+  void SetQuantizedInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(input_, data);
+  }
+
+  void SetQuantizedPadValue(float data) {
+    QuantizeAndPopulate<uint8_t>(constant_values_, {data});
+  }
+
+  void SetPaddings(std::initializer_list<int> paddings) {
+    PopulateTensor<int>(paddings_, paddings);
+  }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+
+ protected:
+  int input_;
+  int output_;
+  int paddings_;
+  int constant_values_;
+};
+
+class PadOpConstModel : public PadOpModel<float> {
+ public:
+  PadOpConstModel(const TensorData& input,
+                  std::initializer_list<int> paddings_shape,
+                  std::initializer_list<int> paddings,
+                  const TensorData& output) {
+    input_ = AddInput(input);
+    paddings_ = AddConstInput(TensorType_INT32, paddings, paddings_shape);
+    output_ = AddOutput(output);
+
+    SetBuiltinOp(BuiltinOperator_PAD, BuiltinOptions_PadOptions,
+                 CreatePadOptions(builder_).Union());
+    BuildInterpreter({input.shape});
+  }
+};
+
+TEST(NNAPIDelegate, PadAdvancedConstTest) {
+  PadOpConstModel m({TensorType_FLOAT32, {1, 2, 3, 1}}, {4, 2},
+                    {0, 0, 0, 2, 1, 3, 0, 0}, {TensorType_FLOAT32});
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 6, 0, 0, 0,
+                                0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 4, 7, 1}));
+}
+
+class SpaceToBatchNDOpModel : public SingleOpModelWithNNAPI {
+ public:
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor<float>(input_, data);
+  }
+
+  void SetBlockShape(std::initializer_list<int> data) {
+    PopulateTensor<int>(block_shape_, data);
+  }
+
+  void SetPaddings(std::initializer_list<int> data) {
+    PopulateTensor<int>(paddings_, data);
+  }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input_;
+  int block_shape_;
+  int paddings_;
+  int output_;
+};
+
+class SpaceToBatchNDOpConstModel : public SpaceToBatchNDOpModel {
+ public:
+  SpaceToBatchNDOpConstModel(std::initializer_list<int> input_shape,
+                             std::initializer_list<int> block_shape,
+                             std::initializer_list<int> paddings) {
+    input_ = AddInput(TensorType_FLOAT32);
+    block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2});
+    paddings_ = AddConstInput(TensorType_INT32, paddings, {2, 2});
+    output_ = AddOutput(TensorType_FLOAT32);
+
+    SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND,
+                 BuiltinOptions_SpaceToBatchNDOptions,
+                 CreateSpaceToBatchNDOptions(builder_).Union());
+    BuildInterpreter({input_shape});
+  }
+};
+
+TEST(NNAPIDelegate, SpaceToBatchNDSimpleConstTest) {
+  SpaceToBatchNDOpConstModel m({1, 4, 4, 1}, {2, 2}, {0, 0, 0, 0});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7,
+                                               13, 15, 6, 8, 14, 16}));
+}
+
+TEST(NNAPIDelegate, SpaceToBatchNDMultipleInputBatchesConstTest) {
+  SpaceToBatchNDOpConstModel m({2, 2, 4, 1}, {2, 2}, {0, 0, 0, 0});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({8, 1, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 9, 11, 2, 4, 10, 12, 5, 7,
+                                               13, 15, 6, 8, 14, 16}));
+}
+
+TEST(NNAPIDelegate, SpaceToBatchNDSimplePaddingConstTest) {
+  SpaceToBatchNDOpConstModel m({1, 5, 2, 1}, {3, 2}, {1, 0, 2, 0});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0, 0, 0, 5, 0, 0, 0, 6, 0, 1, 0, 7,
+                                 0, 2, 0, 8, 0, 3, 0, 9, 0, 4, 0, 10,
+                             }));
+}
+
+TEST(NNAPIDelegate, SpaceToBatchNDComplexPaddingConstTest) {
+  SpaceToBatchNDOpConstModel m({1, 4, 2, 1}, {3, 2}, {1, 1, 2, 4});
+  m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0,
+                                 0, 1, 0, 0, 0, 7, 0, 0, 0, 2, 0, 0, 0, 8, 0, 0,
+                                 0, 3, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0,
+                             }));
+}
+
+template <typename input_type = float,
+          TensorType tensor_input_type = TensorType_FLOAT32>
+class StridedSliceOpModel : public SingleOpModelWithNNAPI {
+ public:
+  StridedSliceOpModel(std::initializer_list<int> input_shape,
+                      std::initializer_list<int> begin_shape,
+                      std::initializer_list<int> end_shape,
+                      std::initializer_list<int> strides_shape, int begin_mask,
+                      int end_mask, int ellipsis_mask, int new_axis_mask,
+                      int shrink_axis_mask) {
+    input_ = AddInput(tensor_input_type);
+    begin_ = AddInput(TensorType_INT32);
+    end_ = AddInput(TensorType_INT32);
+    strides_ = AddInput(TensorType_INT32);
+    output_ = AddOutput(tensor_input_type);
+    SetBuiltinOp(
+        BuiltinOperator_STRIDED_SLICE, BuiltinOptions_StridedSliceOptions,
+        CreateStridedSliceOptions(builder_, begin_mask, end_mask, ellipsis_mask,
+                                  new_axis_mask, shrink_axis_mask)
+            .Union());
+    BuildInterpreter({input_shape, begin_shape, end_shape, strides_shape});
+  }
+
+  void SetInput(std::initializer_list<input_type> data) {
+    PopulateTensor<input_type>(input_, data);
+  }
+  void SetBegin(std::initializer_list<int32_t> data) {
+    PopulateTensor<int32_t>(begin_, data);
+  }
+  void SetEnd(std::initializer_list<int32_t> data) {
+    PopulateTensor<int32_t>(end_, data);
+  }
+  void SetStrides(std::initializer_list<int32_t> data) {
+    PopulateTensor<int32_t>(strides_, data);
+  }
+
+  std::vector<input_type> GetOutput() {
+    return ExtractVector<input_type>(output_);
+  }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int input_;
+  int begin_;
+  int end_;
+  int strides_;
+  int output_;
+};
+
+TEST(NNAPIDelegate, StridedSliceIn2D) {
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 0);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({1, 0});
+  m.SetEnd({2, 2});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2}));
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({4, 5}));
+}
+
+TEST(NNAPIDelegate, StridedSliceIn2D_ShrinkAxis_NegativeSlice) {
+  // This is equivalent to tf.range(4)[:, tf.newaxis][-2, -1].
+  StridedSliceOpModel<> m({4, 1}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
+  m.SetInput({0, 1, 2, 3});
+  m.SetBegin({-2, -1});
+  m.SetEnd({-1, 0});
+  m.SetStrides({1, 1});
+
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({2}));
+}
+
+TEST(NNAPIDelegate, StridedSliceIn2D_ShrinkAxisMask) {
+  StridedSliceOpModel<> m({2, 3}, {2}, {2}, {2}, 0, 0, 0, 0, 3);
+  m.SetInput({1, 2, 3, 4, 5, 6});
+  m.SetBegin({0, 0});
+  m.SetEnd({1, 1});
+  m.SetStrides({1, 1});
+  m.Invoke();
+  EXPECT_TRUE(m.GetOutputShape().empty());
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({1}));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/examples/android/app/README.md b/tensorflow/contrib/lite/examples/android/app/README.md
index 8e12bd04dd0517b229265bec15981b6eea2345df..cbdeeac8790d93210a6c637953605b4ca270d3f6 100644
--- a/tensorflow/contrib/lite/examples/android/app/README.md
+++ b/tensorflow/contrib/lite/examples/android/app/README.md
@@ -2,9 +2,9 @@
 
 ## Building from Source with Bazel
 
-1. Follow the [Bazel steps for the TF Demo App](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android#bazel).
+1. Install [Bazel](https://docs.bazel.build/versions/master/install.html), the Android NDK and SDK. The recommended versions are specified on this [webpage](https://www.tensorflow.org/mobile/tflite/demo_android#build_tensorflow_lite_and_the_demo_app_from_source).
 
-2. Build the app with Bazel. The demo needs C++11. We configure the fat_apk_cpu flag to package support for 4 hardware variants. You may replace it with --config=android_arm64 on a 64-bit device and --config=android_arm for 32-bit device:
+2. Build this demo app with Bazel. The demo needs C++11. We configure the fat_apk_cpu flag to package support for 4 hardware variants. You may replace it with --config=android_arm64 on a 64-bit device and --config=android_arm for 32-bit device:
 
   ```shell
   bazel build -c opt --cxxopt='--std=c++11' --fat_apk_cpu=x86,x86_64,arm64-v8a,armeabi-v7a \
diff --git a/tensorflow/contrib/lite/experimental/c/BUILD b/tensorflow/contrib/lite/experimental/c/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..50f8da66d06abaf0637866e85c04e80fee042071
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/c/BUILD
@@ -0,0 +1,59 @@
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+load(
+    "//tensorflow/contrib/lite:build_def.bzl",
+    "tflite_cc_shared_object",
+    "tflite_copts",
+    "tflite_jni_binary",
+)
+
+tflite_cc_shared_object(
+    name = "libtensorflowlite_c.so",
+    linkopts = select({
+        "//tensorflow:darwin": [
+            "-Wl,-exported_symbols_list",  # This line must be directly followed by the exported_symbols.lds file
+            "$(location //tensorflow/contrib/lite/experimental/c:exported_symbols.lds)",
+            "-Wl,-install_name,@rpath/libtensorflowlite_c.so",
+        ],
+        "//tensorflow:windows": [],
+        "//conditions:default": [
+            "-z defs",
+            "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
+            "$(location //tensorflow/contrib/lite/experimental/c:version_script.lds)",
+        ],
+    }),
+    deps = [
+        ":c_api",
+        ":exported_symbols.lds",
+        ":version_script.lds",
+    ],
+)
+
+cc_library(
+    name = "c_api",
+    srcs = ["c_api.cc"],
+    hdrs = ["c_api.h"],
+    copts = tflite_copts(),
+    deps = [
+        "//tensorflow/contrib/lite:context",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:schema_fbs_version",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+    ],
+)
+
+cc_test(
+    name = "c_api_test",
+    size = "small",
+    srcs = ["c_api_test.cc"],
+    data = ["//tensorflow/contrib/lite:testdata/add.bin"],
+    deps = [
+        ":c_api",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:kernel_api",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/contrib/lite/experimental/c/c_api.cc b/tensorflow/contrib/lite/experimental/c/c_api.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9d29e8b3e055e86a9e68285d81de742e36452215
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/c/c_api.cc
@@ -0,0 +1,122 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/experimental/c/c_api.h"
+
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+struct _TFL_Interpreter {
+  std::unique_ptr<tflite::Interpreter> impl;
+};
+
+// LINT.IfChange
+
+TFL_Interpreter* TFL_NewInterpreter(const void* model_data,
+                                    int32_t model_size) {
+  auto model = tflite::FlatBufferModel::BuildFromBuffer(
+      static_cast<const char*>(model_data), static_cast<size_t>(model_size));
+  if (!model) {
+    return nullptr;
+  }
+
+  tflite::ops::builtin::BuiltinOpResolver resolver;
+  tflite::InterpreterBuilder builder(*model, resolver);
+  std::unique_ptr<tflite::Interpreter> interpreter_impl;
+  if (builder(&interpreter_impl) != kTfLiteOk) {
+    return nullptr;
+  }
+
+  return new TFL_Interpreter{std::move(interpreter_impl)};
+}
+
+void TFL_DeleteInterpreter(TFL_Interpreter* interpreter) { delete interpreter; }
+
+int32_t TFL_InterpreterGetInputTensorCount(const TFL_Interpreter* interpreter) {
+  return static_cast<int>(interpreter->impl->inputs().size());
+}
+
+TFL_Tensor* TFL_InterpreterGetInputTensor(const TFL_Interpreter* interpreter,
+                                          int32_t input_index) {
+  return interpreter->impl->tensor(interpreter->impl->inputs()[input_index]);
+}
+
+TFL_Status TFL_InterpreterResizeInputTensor(TFL_Interpreter* interpreter,
+                                            int32_t input_index,
+                                            const int* input_dims,
+                                            int32_t input_dims_size) {
+  std::vector<int> dims{input_dims, input_dims + input_dims_size};
+  return interpreter->impl->ResizeInputTensor(
+      interpreter->impl->inputs()[input_index], dims);
+}
+
+TFL_Status TFL_InterpreterAllocateTensors(TFL_Interpreter* interpreter) {
+  return interpreter->impl->AllocateTensors();
+}
+
+TFL_Status TFL_InterpreterInvoke(TFL_Interpreter* interpreter) {
+  return interpreter->impl->Invoke();
+}
+
+int32_t TFL_InterpreterGetOutputTensorCount(
+    const TFL_Interpreter* interpreter) {
+  return static_cast<int>(interpreter->impl->outputs().size());
+}
+
+const TFL_Tensor* TFL_InterpreterGetOutputTensor(
+    const TFL_Interpreter* interpreter, int32_t output_index) {
+  return interpreter->impl->tensor(interpreter->impl->outputs()[output_index]);
+}
+
+TFL_Type TFL_TensorType(const TFL_Tensor* tensor) { return tensor->type; }
+
+int32_t TFL_TensorNumDims(const TFL_Tensor* tensor) {
+  return tensor->dims->size;
+}
+
+int32_t TFL_TensorDim(const TFL_Tensor* tensor, int32_t dim_index) {
+  return tensor->dims->data[dim_index];
+}
+
+size_t TFL_TensorByteSize(const TFL_Tensor* tensor) { return tensor->bytes; }
+
+TFL_Status TFL_TensorCopyFromBuffer(TFL_Tensor* tensor, const void* input_data,
+                                    int32_t input_data_size) {
+  if (tensor->bytes != static_cast<size_t>(input_data_size)) {
+    return kTfLiteError;
+  }
+  memcpy(tensor->data.raw, input_data, input_data_size);
+  return kTfLiteOk;
+}
+
+TFL_Status TFL_TensorCopyToBuffer(const TFL_Tensor* tensor, void* output_data,
+                                  int32_t output_data_size) {
+  if (tensor->bytes != static_cast<size_t>(output_data_size)) {
+    return kTfLiteError;
+  }
+  memcpy(output_data, tensor->data.raw, output_data_size);
+  return kTfLiteOk;
+}
+
+// LINT.ThenChange(//tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs)
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
diff --git a/tensorflow/contrib/lite/experimental/c/c_api.h b/tensorflow/contrib/lite/experimental/c/c_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..070f1add13c9904e1a2b3736001ada0e274fdc55
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/c/c_api.h
@@ -0,0 +1,149 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_H_
+#define TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_H_
+
+#include <stdint.h>
+
+// Eventually the various C APIs defined in context.h will be migrated into
+// the appropriate /c/c_api*.h header. For now, we pull in existing definitions
+// for convenience.
+#include "tensorflow/contrib/lite/context.h"
+
+// --------------------------------------------------------------------------
+// Experimental C API for TensorFlowLite.
+//
+// The API leans towards simplicity and uniformity instead of convenience, as
+// most usage will be by language-specific wrappers.
+//
+// Conventions:
+// * We use the prefix TFL_ for everything in the API.
+
+#ifdef SWIG
+#define TFL_CAPI_EXPORT
+#else
+#if defined(_WIN32)
+#ifdef TF_COMPILE_LIBRARY
+#define TFL_CAPI_EXPORT __declspec(dllexport)
+#else
+#define TFL_CAPI_EXPORT __declspec(dllimport)
+#endif  // TF_COMPILE_LIBRARY
+#else
+#define TFL_CAPI_EXPORT __attribute__((visibility("default")))
+#endif  // _WIN32
+#endif  // SWIG
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef TfLiteTensor TFL_Tensor;
+typedef TfLiteStatus TFL_Status;
+typedef TfLiteType TFL_Type;
+
+// --------------------------------------------------------------------------
+// TFL_Interpreter provides inference from a provided model.
+typedef struct _TFL_Interpreter TFL_Interpreter;
+
+// Returns an interpreter for the provided model, or null on failure.
+//
+// NOTE: The client *must* explicitly allocate tensors before attempting to
+// access input tensor data or invoke the interpreter.
+TFL_CAPI_EXPORT extern TFL_Interpreter* TFL_NewInterpreter(
+    const void* model_data, int32_t model_size);
+
+// Destroys the interpreter.
+TFL_CAPI_EXPORT extern void TFL_DeleteInterpreter(TFL_Interpreter* interpreter);
+
+// Returns the number of input tensors associated with the model.
+TFL_CAPI_EXPORT extern int TFL_InterpreterGetInputTensorCount(
+    const TFL_Interpreter* interpreter);
+
+// Returns the tensor associated with the input index.
+// REQUIRES: 0 <= input_index < TFL_InterpreterGetInputTensorCount(tensor)
+TFL_CAPI_EXPORT extern TFL_Tensor* TFL_InterpreterGetInputTensor(
+    const TFL_Interpreter* interpreter, int32_t input_index);
+
+// Attempts to resize the specified input tensor.
+// NOTE: After a resize, the client *must* explicitly allocate tensors before
+// attempting to access the resized tensor data or invoke the interpreter.
+// REQUIRES: 0 <= input_index < TFL_InterpreterGetInputTensorCount(tensor)
+TFL_CAPI_EXPORT extern TFL_Status TFL_InterpreterResizeInputTensor(
+    TFL_Interpreter* interpreter, int32_t input_index, const int* input_dims,
+    int32_t input_dims_size);
+
+// Updates allocations for all tensors, resizing dependent tensors using the
+// specified input tensor dimensionality.
+//
+// This is a relatively expensive operation, and need only be called after
+// creating the graph and/or resizing any inputs.
+TFL_CAPI_EXPORT extern TFL_Status TFL_InterpreterAllocateTensors(
+    TFL_Interpreter* interpreter);
+
+// Runs inference for the loaded graph.
+//
+// NOTE: It is possible that the interpreter is not in a ready state to
+// evaluate (e.g., if a ResizeInputTensor() has been performed without a call to
+// AllocateTensors()).
+TFL_CAPI_EXPORT extern TFL_Status TFL_InterpreterInvoke(
+    TFL_Interpreter* interpreter);
+
+// Returns the number of output tensors associated with the model.
+TFL_CAPI_EXPORT extern int32_t TFL_InterpreterGetOutputTensorCount(
+    const TFL_Interpreter* interpreter);
+
+// Returns the tensor associated with the output index.
+// REQUIRES: 0 <= input_index < TFL_InterpreterGetOutputTensorCount(tensor)
+TFL_CAPI_EXPORT extern const TFL_Tensor* TFL_InterpreterGetOutputTensor(
+    const TFL_Interpreter* interpreter, int32_t output_index);
+
+// --------------------------------------------------------------------------
+// TFL_Tensor wraps data associated with a graph tensor.
+//
+// Note that, while the TFL_Tensor struct is not currently opaque, and its
+// fields can be accessed directly, these methods are still convenient for
+// language bindings. In the future the tensor struct will likely be made opaque
+// in the public API.
+
+// Returns the type of a tensor element.
+TFL_CAPI_EXPORT extern TFL_Type TFL_TensorType(const TFL_Tensor* tensor);
+
+// Returns the number of dimensions that the tensor has.
+TFL_CAPI_EXPORT extern int32_t TFL_TensorNumDims(const TFL_Tensor* tensor);
+
+// Returns the length of the tensor in the "dim_index" dimension.
+// REQUIRES: 0 <= dim_index < TFLiteTensorNumDims(tensor)
+TFL_CAPI_EXPORT extern int32_t TFL_TensorDim(const TFL_Tensor* tensor,
+                                             int32_t dim_index);
+
+// Returns the size of the underlying data in bytes.
+TFL_CAPI_EXPORT extern size_t TFL_TensorByteSize(const TFL_Tensor* tensor);
+
+// Copies from the provided input buffer into the tensor's buffer.
+// REQUIRES: input_data_size == TFL_TensorByteSize(tensor)
+TFL_CAPI_EXPORT extern TFL_Status TFL_TensorCopyFromBuffer(
+    TFL_Tensor* tensor, const void* input_data, int32_t input_data_size);
+
+// Copies to the provided output buffer from the tensor's buffer.
+// REQUIRES: output_data_size == TFL_TensorByteSize(tensor)
+TFL_CAPI_EXPORT extern TFL_Status TFL_TensorCopyToBuffer(
+    const TFL_Tensor* output_tensor, void* output_data,
+    int32_t output_data_size);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // TENSORFLOW_CONTRIB_LITE_EXPERIMENTAL_C_C_API_H_
diff --git a/tensorflow/contrib/lite/experimental/c/c_api_test.cc b/tensorflow/contrib/lite/experimental/c/c_api_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc925e00a6096c5e8abcc0fa68b335c4db4401c3
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/c/c_api_test.cc
@@ -0,0 +1,84 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <array>
+
+#include "tensorflow/contrib/lite/experimental/c/c_api.h"
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/allocation.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace {
+
+TEST(CApiSimple, Smoke) {
+  tflite::FileCopyAllocation model_file(
+      "tensorflow/contrib/lite/testdata/add.bin",
+      tflite::DefaultErrorReporter());
+
+  TFL_Interpreter* interpreter =
+      TFL_NewInterpreter(model_file.base(), model_file.bytes());
+  ASSERT_NE(interpreter, nullptr);
+  ASSERT_EQ(TFL_InterpreterAllocateTensors(interpreter), kTfLiteOk);
+
+  ASSERT_EQ(TFL_InterpreterGetInputTensorCount(interpreter), 1);
+  ASSERT_EQ(TFL_InterpreterGetOutputTensorCount(interpreter), 1);
+
+  std::array<int, 1> input_dims = {2};
+  ASSERT_EQ(TFL_InterpreterResizeInputTensor(interpreter, 0, input_dims.data(),
+                                             input_dims.size()),
+            kTfLiteOk);
+  ASSERT_EQ(TFL_InterpreterAllocateTensors(interpreter), kTfLiteOk);
+
+  TFL_Tensor* input_tensor = TFL_InterpreterGetInputTensor(interpreter, 0);
+  ASSERT_NE(input_tensor, nullptr);
+  EXPECT_EQ(TFL_TensorType(input_tensor), kTfLiteFloat32);
+  EXPECT_EQ(TFL_TensorNumDims(input_tensor), 1);
+  EXPECT_EQ(TFL_TensorDim(input_tensor, 0), 2);
+  EXPECT_EQ(TFL_TensorByteSize(input_tensor), sizeof(float) * 2);
+
+  std::array<float, 2> input = {1.f, 3.f};
+  ASSERT_EQ(TFL_TensorCopyFromBuffer(input_tensor, input.data(),
+                                     input.size() * sizeof(float)),
+            kTfLiteOk);
+
+  ASSERT_EQ(TFL_InterpreterInvoke(interpreter), kTfLiteOk);
+
+  const TFL_Tensor* output_tensor =
+      TFL_InterpreterGetOutputTensor(interpreter, 0);
+  ASSERT_NE(output_tensor, nullptr);
+  EXPECT_EQ(TFL_TensorType(output_tensor), kTfLiteFloat32);
+  EXPECT_EQ(TFL_TensorNumDims(output_tensor), 1);
+  EXPECT_EQ(TFL_TensorDim(output_tensor, 0), 2);
+  EXPECT_EQ(TFL_TensorByteSize(output_tensor), sizeof(float) * 2);
+
+  std::array<float, 2> output;
+  ASSERT_EQ(TFL_TensorCopyToBuffer(output_tensor, output.data(),
+                                   output.size() * sizeof(float)),
+            kTfLiteOk);
+  EXPECT_EQ(output[0], 3.f);
+  EXPECT_EQ(output[1], 9.f);
+
+  TFL_DeleteInterpreter(interpreter);
+}
+
+}  // namespace
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/experimental/c/exported_symbols.lds b/tensorflow/contrib/lite/experimental/c/exported_symbols.lds
new file mode 100644
index 0000000000000000000000000000000000000000..a3ddc6bc8d370b1715fb1ebf2a66122296330249
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/c/exported_symbols.lds
@@ -0,0 +1 @@
+_TFL_*
diff --git a/tensorflow/contrib/lite/experimental/c/version_script.lds b/tensorflow/contrib/lite/experimental/c/version_script.lds
new file mode 100644
index 0000000000000000000000000000000000000000..c0c8a2bca19afed186e6f8c72a58989a79c7b251
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/c/version_script.lds
@@ -0,0 +1,9 @@
+VERS_1.0 {
+  # Export symbols in c_api.h.
+  global:
+    *TFL_*;
+
+  # Hide everything else.
+  local:
+    *;
+};
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/.gitignore b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..c72a5cae9ebfb15f60961fe25e622663cad89a41
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/.gitignore
@@ -0,0 +1,13 @@
+# Unity generated
+Builds/
+Temp/
+Library/
+obj/
+# Visual Studio / MonoDevelop generated
+*.csproj
+*.unityproj
+*.sln
+*.suo
+*.userprefs
+# OS generated
+.DS_Store
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite.meta
new file mode 100644
index 0000000000000000000000000000000000000000..ed9337b53e880b62f70953f197613dcb1409d208
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: 71d1b4219b1da4aeaa1cebbec324fc81
+folderAsset: yes
+DefaultImporter:
+  externalObjects: {}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples.meta
new file mode 100644
index 0000000000000000000000000000000000000000..edcce00939a298683b15ea45a5ec92709c6abc4f
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: d948aead14abd4c88947c9886d16f774
+folderAsset: yes
+DefaultImporter:
+  externalObjects: {}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite.meta
new file mode 100644
index 0000000000000000000000000000000000000000..36b35516f0cee064c8d8e4814a2ae515e28590ce
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: b810b85b794fa48fd93100acf5525e1f
+folderAsset: yes
+DefaultImporter:
+  externalObjects: {}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes.meta
new file mode 100644
index 0000000000000000000000000000000000000000..d4133da49a88d38a57d074d28b903f9f18102413
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: 154f4201e2e454d4696fa5834eaa3ad3
+folderAsset: yes
+DefaultImporter:
+  externalObjects: {}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity
new file mode 100644
index 0000000000000000000000000000000000000000..9397d8f27a6cce6013203afc8acc3266a429825d
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity
@@ -0,0 +1,242 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!29 &1
+OcclusionCullingSettings:
+  m_ObjectHideFlags: 0
+  serializedVersion: 2
+  m_OcclusionBakeSettings:
+    smallestOccluder: 5
+    smallestHole: 0.25
+    backfaceThreshold: 100
+  m_SceneGUID: 00000000000000000000000000000000
+  m_OcclusionCullingData: {fileID: 0}
+--- !u!104 &2
+RenderSettings:
+  m_ObjectHideFlags: 0
+  serializedVersion: 8
+  m_Fog: 0
+  m_FogColor: {r: 0.5, g: 0.5, b: 0.5, a: 1}
+  m_FogMode: 3
+  m_FogDensity: 0.01
+  m_LinearFogStart: 0
+  m_LinearFogEnd: 300
+  m_AmbientSkyColor: {r: 0.212, g: 0.227, b: 0.259, a: 1}
+  m_AmbientEquatorColor: {r: 0.114, g: 0.125, b: 0.133, a: 1}
+  m_AmbientGroundColor: {r: 0.047, g: 0.043, b: 0.035, a: 1}
+  m_AmbientIntensity: 1
+  m_AmbientMode: 3
+  m_SubtractiveShadowColor: {r: 0.42, g: 0.478, b: 0.627, a: 1}
+  m_SkyboxMaterial: {fileID: 0}
+  m_HaloStrength: 0.5
+  m_FlareStrength: 1
+  m_FlareFadeSpeed: 3
+  m_HaloTexture: {fileID: 0}
+  m_SpotCookie: {fileID: 10001, guid: 0000000000000000e000000000000000, type: 0}
+  m_DefaultReflectionMode: 0
+  m_DefaultReflectionResolution: 128
+  m_ReflectionBounces: 1
+  m_ReflectionIntensity: 1
+  m_CustomReflection: {fileID: 0}
+  m_Sun: {fileID: 0}
+  m_IndirectSpecularColor: {r: 0, g: 0, b: 0, a: 1}
+--- !u!157 &3
+LightmapSettings:
+  m_ObjectHideFlags: 0
+  serializedVersion: 11
+  m_GIWorkflowMode: 1
+  m_GISettings:
+    serializedVersion: 2
+    m_BounceScale: 1
+    m_IndirectOutputScale: 1
+    m_AlbedoBoost: 1
+    m_TemporalCoherenceThreshold: 1
+    m_EnvironmentLightingMode: 0
+    m_EnableBakedLightmaps: 0
+    m_EnableRealtimeLightmaps: 0
+  m_LightmapEditorSettings:
+    serializedVersion: 9
+    m_Resolution: 2
+    m_BakeResolution: 40
+    m_TextureWidth: 1024
+    m_TextureHeight: 1024
+    m_AO: 0
+    m_AOMaxDistance: 1
+    m_CompAOExponent: 1
+    m_CompAOExponentDirect: 0
+    m_Padding: 2
+    m_LightmapParameters: {fileID: 0}
+    m_LightmapsBakeMode: 1
+    m_TextureCompression: 1
+    m_FinalGather: 0
+    m_FinalGatherFiltering: 1
+    m_FinalGatherRayCount: 256
+    m_ReflectionCompression: 2
+    m_MixedBakeMode: 2
+    m_BakeBackend: 0
+    m_PVRSampling: 1
+    m_PVRDirectSampleCount: 32
+    m_PVRSampleCount: 500
+    m_PVRBounces: 2
+    m_PVRFilterTypeDirect: 0
+    m_PVRFilterTypeIndirect: 0
+    m_PVRFilterTypeAO: 0
+    m_PVRFilteringMode: 1
+    m_PVRCulling: 1
+    m_PVRFilteringGaussRadiusDirect: 1
+    m_PVRFilteringGaussRadiusIndirect: 5
+    m_PVRFilteringGaussRadiusAO: 2
+    m_PVRFilteringAtrousPositionSigmaDirect: 0.5
+    m_PVRFilteringAtrousPositionSigmaIndirect: 2
+    m_PVRFilteringAtrousPositionSigmaAO: 1
+    m_ShowResolutionOverlay: 1
+  m_LightingDataAsset: {fileID: 0}
+  m_UseShadowmask: 1
+--- !u!196 &4
+NavMeshSettings:
+  serializedVersion: 2
+  m_ObjectHideFlags: 0
+  m_BuildSettings:
+    serializedVersion: 2
+    agentTypeID: 0
+    agentRadius: 0.5
+    agentHeight: 2
+    agentSlope: 45
+    agentClimb: 0.4
+    ledgeDropHeight: 0
+    maxJumpAcrossDistance: 0
+    minRegionArea: 2
+    manualCellSize: 0
+    cellSize: 0.16666667
+    manualTileSize: 0
+    tileSize: 256
+    accuratePlacement: 0
+    debug:
+      m_Flags: 0
+  m_NavMeshData: {fileID: 0}
+--- !u!1 &492081941
+GameObject:
+  m_ObjectHideFlags: 0
+  m_PrefabParentObject: {fileID: 0}
+  m_PrefabInternal: {fileID: 0}
+  serializedVersion: 5
+  m_Component:
+  - component: {fileID: 492081945}
+  - component: {fileID: 492081944}
+  - component: {fileID: 492081943}
+  - component: {fileID: 492081942}
+  m_Layer: 0
+  m_Name: Main Camera
+  m_TagString: MainCamera
+  m_Icon: {fileID: 0}
+  m_NavMeshLayer: 0
+  m_StaticEditorFlags: 0
+  m_IsActive: 1
+--- !u!81 &492081942
+AudioListener:
+  m_ObjectHideFlags: 0
+  m_PrefabParentObject: {fileID: 0}
+  m_PrefabInternal: {fileID: 0}
+  m_GameObject: {fileID: 492081941}
+  m_Enabled: 1
+--- !u!124 &492081943
+Behaviour:
+  m_ObjectHideFlags: 0
+  m_PrefabParentObject: {fileID: 0}
+  m_PrefabInternal: {fileID: 0}
+  m_GameObject: {fileID: 492081941}
+  m_Enabled: 1
+--- !u!20 &492081944
+Camera:
+  m_ObjectHideFlags: 0
+  m_PrefabParentObject: {fileID: 0}
+  m_PrefabInternal: {fileID: 0}
+  m_GameObject: {fileID: 492081941}
+  m_Enabled: 1
+  serializedVersion: 2
+  m_ClearFlags: 1
+  m_BackGroundColor: {r: 0.19215687, g: 0.3019608, b: 0.4745098, a: 0}
+  m_NormalizedViewPortRect:
+    serializedVersion: 2
+    x: 0
+    y: 0
+    width: 1
+    height: 1
+  near clip plane: 0.3
+  far clip plane: 1000
+  field of view: 60
+  orthographic: 1
+  orthographic size: 5
+  m_Depth: -1
+  m_CullingMask:
+    serializedVersion: 2
+    m_Bits: 4294967295
+  m_RenderingPath: -1
+  m_TargetTexture: {fileID: 0}
+  m_TargetDisplay: 0
+  m_TargetEye: 3
+  m_HDR: 1
+  m_AllowMSAA: 1
+  m_AllowDynamicResolution: 0
+  m_ForceIntoRT: 0
+  m_OcclusionCulling: 1
+  m_StereoConvergence: 10
+  m_StereoSeparation: 0.022
+--- !u!4 &492081945
+Transform:
+  m_ObjectHideFlags: 0
+  m_PrefabParentObject: {fileID: 0}
+  m_PrefabInternal: {fileID: 0}
+  m_GameObject: {fileID: 492081941}
+  m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
+  m_LocalPosition: {x: 0, y: 0, z: -10}
+  m_LocalScale: {x: 1, y: 1, z: 1}
+  m_Children:
+  - {fileID: 904015944}
+  m_Father: {fileID: 0}
+  m_RootOrder: 0
+  m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
+--- !u!1 &904015943
+GameObject:
+  m_ObjectHideFlags: 0
+  m_PrefabParentObject: {fileID: 0}
+  m_PrefabInternal: {fileID: 0}
+  serializedVersion: 5
+  m_Component:
+  - component: {fileID: 904015944}
+  - component: {fileID: 904015945}
+  m_Layer: 0
+  m_Name: HelloTFLite
+  m_TagString: Untagged
+  m_Icon: {fileID: 0}
+  m_NavMeshLayer: 0
+  m_StaticEditorFlags: 0
+  m_IsActive: 1
+--- !u!4 &904015944
+Transform:
+  m_ObjectHideFlags: 0
+  m_PrefabParentObject: {fileID: 0}
+  m_PrefabInternal: {fileID: 0}
+  m_GameObject: {fileID: 904015943}
+  m_LocalRotation: {x: 0, y: 0, z: 0, w: 1}
+  m_LocalPosition: {x: 0, y: 0, z: 0}
+  m_LocalScale: {x: 1, y: 1, z: 1}
+  m_Children: []
+  m_Father: {fileID: 492081945}
+  m_RootOrder: 0
+  m_LocalEulerAnglesHint: {x: 0, y: 0, z: 0}
+--- !u!114 &904015945
+MonoBehaviour:
+  m_ObjectHideFlags: 0
+  m_PrefabParentObject: {fileID: 0}
+  m_PrefabInternal: {fileID: 0}
+  m_GameObject: {fileID: 904015943}
+  m_Enabled: 1
+  m_EditorHideFlags: 0
+  m_Script: {fileID: 11500000, guid: 899510441e0ca4be0879d3055e467878, type: 3}
+  m_Name: 
+  m_EditorClassIdentifier: 
+  model: {fileID: 4900000, guid: adff4e1dbdba344c199ee4fe7e84457e, type: 3}
+  inputs:
+  - 1
+  - 3
+  - 7
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity.meta
new file mode 100644
index 0000000000000000000000000000000000000000..e1e13efb66027b555f1d45c76fe58fe2103774a2
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/HelloTFLite.unity.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: f8a8c37a396584bb7b21687f33d6d3f8
+DefaultImporter:
+  externalObjects: {}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes
new file mode 100644
index 0000000000000000000000000000000000000000..aef0fe3d82c9d92dc444076d3b46e05af1923f46
Binary files /dev/null and b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes differ
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes.meta
new file mode 100644
index 0000000000000000000000000000000000000000..ba24871413e06154afd0c0d5e2db83b7619d34a9
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scenes/add.bytes.meta
@@ -0,0 +1,7 @@
+fileFormatVersion: 2
+guid: adff4e1dbdba344c199ee4fe7e84457e
+TextScriptImporter:
+  externalObjects: {}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts.meta
new file mode 100644
index 0000000000000000000000000000000000000000..28fde68b8b1346e88375dc7a8613270f0e2f2762
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: f7d1e2dec09b64acdb7b8f5aef9fcb44
+folderAsset: yes
+DefaultImporter:
+  externalObjects: {}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs
new file mode 100644
index 0000000000000000000000000000000000000000..abca8144998367eadaeb0b75d85bb0f6cf3a2057
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs
@@ -0,0 +1,70 @@
+﻿/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Linq;
+using TensorFlowLite;
+using UnityEngine;
+
+/// <summary>
+/// Simple example demonstrating use of the experimental C# bindings for TensorFlowLite.
+/// </summary>
+public class HelloTFLite : MonoBehaviour {
+
+  [Tooltip("Configurable TFLite model.")]
+  public TextAsset model;
+
+  [Tooltip("Configurable TFLite input tensor data.")]
+  public float[] inputs;
+
+  private Interpreter interpreter;
+  private float[] outputs;
+
+  void Start () {
+    interpreter = new Interpreter(model.bytes);
+    Debug.LogFormat("InputCount: {0}, OutputCount: {1}",
+                    interpreter.GetInputTensorCount(),
+                    interpreter.GetOutputTensorCount());
+  }
+
+  void Update () {
+    if (inputs == null) {
+      return;
+    }
+
+    if (outputs == null || outputs.Length != inputs.Length) {
+      interpreter.ResizeInputTensor(0, new int[]{inputs.Length});
+      interpreter.AllocateTensors();
+      outputs = new float[inputs.Length];
+    }
+
+    interpreter.SetInputTensorData(0, inputs);
+    interpreter.Invoke();
+    interpreter.GetOutputTensorData(0, outputs);
+
+    Debug.LogFormat("Input: {0}, Output: {1}",
+                    ArrayToString(inputs),
+                    ArrayToString(outputs));
+  }
+
+  void OnDestroy() {
+    interpreter.Dispose();
+  }
+
+   private static string ArrayToString(float[] values) {
+    return string.Join(",", values.Select(x => x.ToString()).ToArray());
+  }
+}
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs.meta
new file mode 100644
index 0000000000000000000000000000000000000000..ba83f45084bb624e5e7777684b0fda98b4d46688
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/Examples/HelloTFLite/Scripts/HelloTFLite.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 899510441e0ca4be0879d3055e467878
+MonoImporter:
+  externalObjects: {}
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK.meta
new file mode 100644
index 0000000000000000000000000000000000000000..bf5ce15c6a6932398d798d193b54f4ecfd8ba2d8
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: 16dad1655bcdc48f7b325a2a634b9c69
+folderAsset: yes
+DefaultImporter:
+  externalObjects: {}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts.meta
new file mode 100644
index 0000000000000000000000000000000000000000..22ed2c466bde1668595967f7a07f34a9193aaec8
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts.meta
@@ -0,0 +1,8 @@
+fileFormatVersion: 2
+guid: d70863368f8904d509a9b73d3a555914
+folderAsset: yes
+DefaultImporter:
+  externalObjects: {}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs
new file mode 100644
index 0000000000000000000000000000000000000000..ab966bae2efb9431e2f9f35dc818d130aabd71f6
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs
@@ -0,0 +1,145 @@
+﻿/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+using System;
+using System.Runtime.InteropServices;
+
+using TFL_Interpreter = System.IntPtr;
+using TFL_Tensor = System.IntPtr;
+
+namespace TensorFlowLite
+{
+  /// <summary>
+  /// Simple C# bindings for the experimental TensorFlowLite C API.
+  /// </summary>
+  public class Interpreter : IDisposable
+  {
+    private const string TensorFlowLibrary = "tensorflowlite_c";
+
+    private TFL_Interpreter handle;
+
+    public Interpreter(byte[] modelData) {
+      GCHandle modelDataHandle = GCHandle.Alloc(modelData, GCHandleType.Pinned);
+      IntPtr modelDataPtr = modelDataHandle.AddrOfPinnedObject();
+      handle = TFL_NewInterpreter(modelDataPtr, modelData.Length);
+      if (handle == IntPtr.Zero) throw new Exception("Failed to create TensorFlowLite Interpreter");
+    }
+
+    ~Interpreter() {
+      Dispose();
+    }
+
+    public void Dispose() {
+      if (handle != IntPtr.Zero) TFL_DeleteInterpreter(handle);
+      handle = IntPtr.Zero;
+    }
+
+    public void Invoke() {
+      ThrowIfError(TFL_InterpreterInvoke(handle));
+    }
+
+    public int GetInputTensorCount() {
+      return TFL_InterpreterGetInputTensorCount(handle);
+    }
+
+    public void SetInputTensorData(int inputTensorIndex, Array inputTensorData) {
+      GCHandle tensorDataHandle = GCHandle.Alloc(inputTensorData, GCHandleType.Pinned);
+      IntPtr tensorDataPtr = tensorDataHandle.AddrOfPinnedObject();
+      TFL_Tensor tensor = TFL_InterpreterGetInputTensor(handle, inputTensorIndex);
+      ThrowIfError(TFL_TensorCopyFromBuffer(
+          tensor, tensorDataPtr, Buffer.ByteLength(inputTensorData)));
+    }
+
+    public void ResizeInputTensor(int inputTensorIndex, int[] inputTensorShape) {
+      ThrowIfError(TFL_InterpreterResizeInputTensor(
+          handle, inputTensorIndex, inputTensorShape, inputTensorShape.Length));
+    }
+
+    public void AllocateTensors() {
+      ThrowIfError(TFL_InterpreterAllocateTensors(handle));
+    }
+
+    public int GetOutputTensorCount() {
+      return TFL_InterpreterGetOutputTensorCount(handle);
+    }
+
+    public void GetOutputTensorData(int outputTensorIndex, Array outputTensorData) {
+      GCHandle tensorDataHandle = GCHandle.Alloc(outputTensorData, GCHandleType.Pinned);
+      IntPtr tensorDataPtr = tensorDataHandle.AddrOfPinnedObject();
+      TFL_Tensor tensor = TFL_InterpreterGetOutputTensor(handle, outputTensorIndex);
+      ThrowIfError(TFL_TensorCopyToBuffer(
+          tensor, tensorDataPtr, Buffer.ByteLength(outputTensorData)));
+    }
+
+    private static void ThrowIfError(int resultCode) {
+      if (resultCode != 0) throw new Exception("TensorFlowLite operation failed.");
+    }
+
+    #region Externs
+
+    [DllImport (TensorFlowLibrary)]
+    private static extern unsafe TFL_Interpreter TFL_NewInterpreter(
+        IntPtr model_data,
+        int model_size);
+
+    [DllImport (TensorFlowLibrary)]
+    private static extern unsafe void TFL_DeleteInterpreter(TFL_Interpreter interpreter);
+
+    [DllImport (TensorFlowLibrary)]
+    private static extern unsafe int TFL_InterpreterGetInputTensorCount(
+        TFL_Interpreter interpreter);
+
+    [DllImport (TensorFlowLibrary)]
+    private static extern unsafe TFL_Tensor TFL_InterpreterGetInputTensor(
+        TFL_Interpreter interpreter,
+        int input_index);
+
+    [DllImport (TensorFlowLibrary)]
+    private static extern unsafe int TFL_InterpreterResizeInputTensor(
+        TFL_Interpreter interpreter,
+        int input_index,
+        int[] input_dims,
+        int input_dims_size);
+
+    [DllImport (TensorFlowLibrary)]
+    private static extern unsafe int TFL_InterpreterAllocateTensors(
+        TFL_Interpreter interpreter);
+
+    [DllImport (TensorFlowLibrary)]
+    private static extern unsafe int TFL_InterpreterInvoke(TFL_Interpreter interpreter);
+
+    [DllImport (TensorFlowLibrary)]
+    private static extern unsafe int TFL_InterpreterGetOutputTensorCount(
+        TFL_Interpreter interpreter);
+
+    [DllImport (TensorFlowLibrary)]
+    private static extern unsafe TFL_Tensor TFL_InterpreterGetOutputTensor(
+        TFL_Interpreter interpreter,
+        int output_index);
+
+    [DllImport (TensorFlowLibrary)]
+    private static extern unsafe int TFL_TensorCopyFromBuffer(
+        TFL_Tensor tensor,
+        IntPtr input_data,
+        int input_data_size);
+
+    [DllImport (TensorFlowLibrary)]
+    private static extern unsafe int TFL_TensorCopyToBuffer(
+        TFL_Tensor tensor,
+        IntPtr output_data,
+        int output_data_size);
+
+    #endregion
+  }
+}
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs.meta b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs.meta
new file mode 100644
index 0000000000000000000000000000000000000000..5ec84ef7f70e9be45ff6292ed7a412fac35010de
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/Assets/TensorFlowLite/SDK/Scripts/Interpreter.cs.meta
@@ -0,0 +1,11 @@
+fileFormatVersion: 2
+guid: 0bbaf59e6ac914ed1b28174fb9008a09
+MonoImporter:
+  externalObjects: {}
+  serializedVersion: 2
+  defaultReferences: []
+  executionOrder: 0
+  icon: {instanceID: 0}
+  userData: 
+  assetBundleName: 
+  assetBundleVariant: 
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/AudioManager.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/AudioManager.asset
new file mode 100644
index 0000000000000000000000000000000000000000..da6112576a5ca4290108f6d4c731bd4c391e91d4
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/AudioManager.asset
@@ -0,0 +1,17 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!11 &1
+AudioManager:
+  m_ObjectHideFlags: 0
+  m_Volume: 1
+  Rolloff Scale: 1
+  Doppler Factor: 1
+  Default Speaker Mode: 2
+  m_SampleRate: 0
+  m_DSPBufferSize: 0
+  m_VirtualVoiceCount: 512
+  m_RealVoiceCount: 32
+  m_SpatializerPlugin: 
+  m_AmbisonicDecoderPlugin: 
+  m_DisableAudio: 0
+  m_VirtualizeEffects: 1
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ClusterInputManager.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ClusterInputManager.asset
new file mode 100644
index 0000000000000000000000000000000000000000..e7886b266a005f4d9d80f2fef8d1649dcfd3ed2b
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ClusterInputManager.asset
@@ -0,0 +1,6 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!236 &1
+ClusterInputManager:
+  m_ObjectHideFlags: 0
+  m_Inputs: []
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/DynamicsManager.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/DynamicsManager.asset
new file mode 100644
index 0000000000000000000000000000000000000000..78992f08c7ab7a4353c8a7d07cf1548174aaacbf
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/DynamicsManager.asset
@@ -0,0 +1,29 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!55 &1
+PhysicsManager:
+  m_ObjectHideFlags: 0
+  serializedVersion: 7
+  m_Gravity: {x: 0, y: -9.81, z: 0}
+  m_DefaultMaterial: {fileID: 0}
+  m_BounceThreshold: 2
+  m_SleepThreshold: 0.005
+  m_DefaultContactOffset: 0.01
+  m_DefaultSolverIterations: 6
+  m_DefaultSolverVelocityIterations: 1
+  m_QueriesHitBackfaces: 0
+  m_QueriesHitTriggers: 1
+  m_EnableAdaptiveForce: 0
+  m_ClothInterCollisionDistance: 0
+  m_ClothInterCollisionStiffness: 0
+  m_ContactsGeneration: 1
+  m_LayerCollisionMatrix: ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
+  m_AutoSimulation: 1
+  m_AutoSyncTransforms: 1
+  m_ClothInterCollisionSettingsToggle: 0
+  m_ContactPairsMode: 0
+  m_BroadphaseType: 0
+  m_WorldBounds:
+    m_Center: {x: 0, y: 0, z: 0}
+    m_Extent: {x: 250, y: 250, z: 250}
+  m_WorldSubdivisions: 8
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorBuildSettings.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorBuildSettings.asset
new file mode 100644
index 0000000000000000000000000000000000000000..6dc24f7dfdb697ad6f5d0a4ec5599bcd3cbd2f43
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorBuildSettings.asset
@@ -0,0 +1,7 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!1045 &1
+EditorBuildSettings:
+  m_ObjectHideFlags: 0
+  serializedVersion: 2
+  m_Scenes: []
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorSettings.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorSettings.asset
new file mode 100644
index 0000000000000000000000000000000000000000..fcd016402f97e4c009a16640517a6930ed615ef9
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/EditorSettings.asset
@@ -0,0 +1,21 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!159 &1
+EditorSettings:
+  m_ObjectHideFlags: 0
+  serializedVersion: 7
+  m_ExternalVersionControlSupport: Visible Meta Files
+  m_SerializationMode: 2
+  m_LineEndingsForNewScripts: 1
+  m_DefaultBehaviorMode: 1
+  m_SpritePackerMode: 4
+  m_SpritePackerPaddingPower: 1
+  m_EtcTextureCompressorBehavior: 1
+  m_EtcTextureFastCompressor: 1
+  m_EtcTextureNormalCompressor: 2
+  m_EtcTextureBestCompressor: 4
+  m_ProjectGenerationIncludedExtensions: txt;xml;fnt;cd;asmdef;rsp
+  m_ProjectGenerationRootNamespace: 
+  m_UserGeneratedProjectSuffix: 
+  m_CollabEditorSettings:
+    inProgressEnabled: 1
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/GraphicsSettings.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/GraphicsSettings.asset
new file mode 100644
index 0000000000000000000000000000000000000000..74d7b532b092680d2b87092007e38f2cbc6b3a00
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/GraphicsSettings.asset
@@ -0,0 +1,61 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!30 &1
+GraphicsSettings:
+  m_ObjectHideFlags: 0
+  serializedVersion: 12
+  m_Deferred:
+    m_Mode: 1
+    m_Shader: {fileID: 69, guid: 0000000000000000f000000000000000, type: 0}
+  m_DeferredReflections:
+    m_Mode: 1
+    m_Shader: {fileID: 74, guid: 0000000000000000f000000000000000, type: 0}
+  m_ScreenSpaceShadows:
+    m_Mode: 1
+    m_Shader: {fileID: 64, guid: 0000000000000000f000000000000000, type: 0}
+  m_LegacyDeferred:
+    m_Mode: 1
+    m_Shader: {fileID: 63, guid: 0000000000000000f000000000000000, type: 0}
+  m_DepthNormals:
+    m_Mode: 1
+    m_Shader: {fileID: 62, guid: 0000000000000000f000000000000000, type: 0}
+  m_MotionVectors:
+    m_Mode: 1
+    m_Shader: {fileID: 75, guid: 0000000000000000f000000000000000, type: 0}
+  m_LightHalo:
+    m_Mode: 1
+    m_Shader: {fileID: 105, guid: 0000000000000000f000000000000000, type: 0}
+  m_LensFlare:
+    m_Mode: 1
+    m_Shader: {fileID: 102, guid: 0000000000000000f000000000000000, type: 0}
+  m_AlwaysIncludedShaders:
+  - {fileID: 7, guid: 0000000000000000f000000000000000, type: 0}
+  - {fileID: 15104, guid: 0000000000000000f000000000000000, type: 0}
+  - {fileID: 15105, guid: 0000000000000000f000000000000000, type: 0}
+  - {fileID: 15106, guid: 0000000000000000f000000000000000, type: 0}
+  - {fileID: 10753, guid: 0000000000000000f000000000000000, type: 0}
+  - {fileID: 10770, guid: 0000000000000000f000000000000000, type: 0}
+  m_PreloadedShaders: []
+  m_SpritesDefaultMaterial: {fileID: 10754, guid: 0000000000000000f000000000000000,
+    type: 0}
+  m_CustomRenderPipeline: {fileID: 0}
+  m_TransparencySortMode: 0
+  m_TransparencySortAxis: {x: 0, y: 0, z: 1}
+  m_DefaultRenderingPath: 1
+  m_DefaultMobileRenderingPath: 1
+  m_TierSettings: []
+  m_LightmapStripping: 0
+  m_FogStripping: 0
+  m_InstancingStripping: 0
+  m_LightmapKeepPlain: 1
+  m_LightmapKeepDirCombined: 1
+  m_LightmapKeepDynamicPlain: 1
+  m_LightmapKeepDynamicDirCombined: 1
+  m_LightmapKeepShadowMask: 1
+  m_LightmapKeepSubtractive: 1
+  m_FogKeepLinear: 1
+  m_FogKeepExp: 1
+  m_FogKeepExp2: 1
+  m_AlbedoSwatchInfos: []
+  m_LightsUseLinearIntensity: 0
+  m_LightsUseColorTemperature: 0
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/InputManager.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/InputManager.asset
new file mode 100644
index 0000000000000000000000000000000000000000..17c8f538e2152c0a0310b4870979eeecece2153c
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/InputManager.asset
@@ -0,0 +1,295 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!13 &1
+InputManager:
+  m_ObjectHideFlags: 0
+  serializedVersion: 2
+  m_Axes:
+  - serializedVersion: 3
+    m_Name: Horizontal
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: left
+    positiveButton: right
+    altNegativeButton: a
+    altPositiveButton: d
+    gravity: 3
+    dead: 0.001
+    sensitivity: 3
+    snap: 1
+    invert: 0
+    type: 0
+    axis: 0
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Vertical
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: down
+    positiveButton: up
+    altNegativeButton: s
+    altPositiveButton: w
+    gravity: 3
+    dead: 0.001
+    sensitivity: 3
+    snap: 1
+    invert: 0
+    type: 0
+    axis: 0
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Fire1
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: 
+    positiveButton: left ctrl
+    altNegativeButton: 
+    altPositiveButton: mouse 0
+    gravity: 1000
+    dead: 0.001
+    sensitivity: 1000
+    snap: 0
+    invert: 0
+    type: 0
+    axis: 0
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Fire2
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: 
+    positiveButton: left alt
+    altNegativeButton: 
+    altPositiveButton: mouse 1
+    gravity: 1000
+    dead: 0.001
+    sensitivity: 1000
+    snap: 0
+    invert: 0
+    type: 0
+    axis: 0
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Fire3
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: 
+    positiveButton: left shift
+    altNegativeButton: 
+    altPositiveButton: mouse 2
+    gravity: 1000
+    dead: 0.001
+    sensitivity: 1000
+    snap: 0
+    invert: 0
+    type: 0
+    axis: 0
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Jump
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: 
+    positiveButton: space
+    altNegativeButton: 
+    altPositiveButton: 
+    gravity: 1000
+    dead: 0.001
+    sensitivity: 1000
+    snap: 0
+    invert: 0
+    type: 0
+    axis: 0
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Mouse X
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: 
+    positiveButton: 
+    altNegativeButton: 
+    altPositiveButton: 
+    gravity: 0
+    dead: 0
+    sensitivity: 0.1
+    snap: 0
+    invert: 0
+    type: 1
+    axis: 0
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Mouse Y
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: 
+    positiveButton: 
+    altNegativeButton: 
+    altPositiveButton: 
+    gravity: 0
+    dead: 0
+    sensitivity: 0.1
+    snap: 0
+    invert: 0
+    type: 1
+    axis: 1
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Mouse ScrollWheel
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: 
+    positiveButton: 
+    altNegativeButton: 
+    altPositiveButton: 
+    gravity: 0
+    dead: 0
+    sensitivity: 0.1
+    snap: 0
+    invert: 0
+    type: 1
+    axis: 2
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Horizontal
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: 
+    positiveButton: 
+    altNegativeButton: 
+    altPositiveButton: 
+    gravity: 0
+    dead: 0.19
+    sensitivity: 1
+    snap: 0
+    invert: 0
+    type: 2
+    axis: 0
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Vertical
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: 
+    positiveButton: 
+    altNegativeButton: 
+    altPositiveButton: 
+    gravity: 0
+    dead: 0.19
+    sensitivity: 1
+    snap: 0
+    invert: 1
+    type: 2
+    axis: 1
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Fire1
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: 
+    positiveButton: joystick button 0
+    altNegativeButton: 
+    altPositiveButton: 
+    gravity: 1000
+    dead: 0.001
+    sensitivity: 1000
+    snap: 0
+    invert: 0
+    type: 0
+    axis: 0
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Fire2
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: 
+    positiveButton: joystick button 1
+    altNegativeButton: 
+    altPositiveButton: 
+    gravity: 1000
+    dead: 0.001
+    sensitivity: 1000
+    snap: 0
+    invert: 0
+    type: 0
+    axis: 0
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Fire3
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: 
+    positiveButton: joystick button 2
+    altNegativeButton: 
+    altPositiveButton: 
+    gravity: 1000
+    dead: 0.001
+    sensitivity: 1000
+    snap: 0
+    invert: 0
+    type: 0
+    axis: 0
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Jump
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: 
+    positiveButton: joystick button 3
+    altNegativeButton: 
+    altPositiveButton: 
+    gravity: 1000
+    dead: 0.001
+    sensitivity: 1000
+    snap: 0
+    invert: 0
+    type: 0
+    axis: 0
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Submit
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: 
+    positiveButton: return
+    altNegativeButton: 
+    altPositiveButton: joystick button 0
+    gravity: 1000
+    dead: 0.001
+    sensitivity: 1000
+    snap: 0
+    invert: 0
+    type: 0
+    axis: 0
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Submit
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: 
+    positiveButton: enter
+    altNegativeButton: 
+    altPositiveButton: space
+    gravity: 1000
+    dead: 0.001
+    sensitivity: 1000
+    snap: 0
+    invert: 0
+    type: 0
+    axis: 0
+    joyNum: 0
+  - serializedVersion: 3
+    m_Name: Cancel
+    descriptiveName: 
+    descriptiveNegativeName: 
+    negativeButton: 
+    positiveButton: escape
+    altNegativeButton: 
+    altPositiveButton: joystick button 1
+    gravity: 1000
+    dead: 0.001
+    sensitivity: 1000
+    snap: 0
+    invert: 0
+    type: 0
+    axis: 0
+    joyNum: 0
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NavMeshAreas.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NavMeshAreas.asset
new file mode 100644
index 0000000000000000000000000000000000000000..3b0b7c3d183abdd300112f56965916ef11667f54
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NavMeshAreas.asset
@@ -0,0 +1,91 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!126 &1
+NavMeshProjectSettings:
+  m_ObjectHideFlags: 0
+  serializedVersion: 2
+  areas:
+  - name: Walkable
+    cost: 1
+  - name: Not Walkable
+    cost: 1
+  - name: Jump
+    cost: 2
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  - name: 
+    cost: 1
+  m_LastAgentTypeID: -887442657
+  m_Settings:
+  - serializedVersion: 2
+    agentTypeID: 0
+    agentRadius: 0.5
+    agentHeight: 2
+    agentSlope: 45
+    agentClimb: 0.75
+    ledgeDropHeight: 0
+    maxJumpAcrossDistance: 0
+    minRegionArea: 2
+    manualCellSize: 0
+    cellSize: 0.16666667
+    manualTileSize: 0
+    tileSize: 256
+    accuratePlacement: 0
+    debug:
+      m_Flags: 0
+  m_SettingNames:
+  - Humanoid
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NetworkManager.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NetworkManager.asset
new file mode 100644
index 0000000000000000000000000000000000000000..5dc6a831d9f2a11f08ed96571e0f602e3c3908b5
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/NetworkManager.asset
@@ -0,0 +1,8 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!149 &1
+NetworkManager:
+  m_ObjectHideFlags: 0
+  m_DebugLevel: 0
+  m_Sendrate: 15
+  m_AssetToPrefab: {}
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/Physics2DSettings.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/Physics2DSettings.asset
new file mode 100644
index 0000000000000000000000000000000000000000..132ee6bc868f1aae138555dc139e054b0d1d8620
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/Physics2DSettings.asset
@@ -0,0 +1,37 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!19 &1
+Physics2DSettings:
+  m_ObjectHideFlags: 0
+  serializedVersion: 3
+  m_Gravity: {x: 0, y: -9.81}
+  m_DefaultMaterial: {fileID: 0}
+  m_VelocityIterations: 8
+  m_PositionIterations: 3
+  m_VelocityThreshold: 1
+  m_MaxLinearCorrection: 0.2
+  m_MaxAngularCorrection: 8
+  m_MaxTranslationSpeed: 100
+  m_MaxRotationSpeed: 360
+  m_BaumgarteScale: 0.2
+  m_BaumgarteTimeOfImpactScale: 0.75
+  m_TimeToSleep: 0.5
+  m_LinearSleepTolerance: 0.01
+  m_AngularSleepTolerance: 2
+  m_DefaultContactOffset: 0.01
+  m_AutoSimulation: 1
+  m_QueriesHitTriggers: 1
+  m_QueriesStartInColliders: 1
+  m_ChangeStopsCallbacks: 0
+  m_CallbacksOnDisable: 1
+  m_AutoSyncTransforms: 1
+  m_AlwaysShowColliders: 0
+  m_ShowColliderSleep: 1
+  m_ShowColliderContacts: 0
+  m_ShowColliderAABB: 0
+  m_ContactArrowScale: 0.2
+  m_ColliderAwakeColor: {r: 0.5686275, g: 0.95686275, b: 0.54509807, a: 0.7529412}
+  m_ColliderAsleepColor: {r: 0.5686275, g: 0.95686275, b: 0.54509807, a: 0.36078432}
+  m_ColliderContactColor: {r: 1, g: 0, b: 1, a: 0.6862745}
+  m_ColliderAABBColor: {r: 1, g: 1, b: 0, a: 0.2509804}
+  m_LayerCollisionMatrix: ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectSettings.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectSettings.asset
new file mode 100644
index 0000000000000000000000000000000000000000..3fbfab76c13c84f66a166c5dfe1d4552503350ff
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectSettings.asset
@@ -0,0 +1,641 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!129 &1
+PlayerSettings:
+  m_ObjectHideFlags: 0
+  serializedVersion: 14
+  productGUID: a084943b991dd4597b140f4ce2b41c65
+  AndroidProfiler: 0
+  AndroidFilterTouchesWhenObscured: 0
+  defaultScreenOrientation: 4
+  targetDevice: 2
+  useOnDemandResources: 0
+  accelerometerFrequency: 60
+  companyName: DefaultCompany
+  productName: TensorFlowLitePlugin
+  defaultCursor: {fileID: 0}
+  cursorHotspot: {x: 0, y: 0}
+  m_SplashScreenBackgroundColor: {r: 0.13725491, g: 0.12156863, b: 0.1254902, a: 1}
+  m_ShowUnitySplashScreen: 1
+  m_ShowUnitySplashLogo: 1
+  m_SplashScreenOverlayOpacity: 1
+  m_SplashScreenAnimation: 1
+  m_SplashScreenLogoStyle: 1
+  m_SplashScreenDrawMode: 0
+  m_SplashScreenBackgroundAnimationZoom: 1
+  m_SplashScreenLogoAnimationZoom: 1
+  m_SplashScreenBackgroundLandscapeAspect: 1
+  m_SplashScreenBackgroundPortraitAspect: 1
+  m_SplashScreenBackgroundLandscapeUvs:
+    serializedVersion: 2
+    x: 0
+    y: 0
+    width: 1
+    height: 1
+  m_SplashScreenBackgroundPortraitUvs:
+    serializedVersion: 2
+    x: 0
+    y: 0
+    width: 1
+    height: 1
+  m_SplashScreenLogos: []
+  m_VirtualRealitySplashScreen: {fileID: 0}
+  m_HolographicTrackingLossScreen: {fileID: 0}
+  defaultScreenWidth: 1024
+  defaultScreenHeight: 768
+  defaultScreenWidthWeb: 960
+  defaultScreenHeightWeb: 600
+  m_StereoRenderingPath: 0
+  m_ActiveColorSpace: 0
+  m_MTRendering: 1
+  m_StackTraceTypes: 010000000100000001000000010000000100000001000000
+  iosShowActivityIndicatorOnLoading: -1
+  androidShowActivityIndicatorOnLoading: -1
+  tizenShowActivityIndicatorOnLoading: -1
+  iosAppInBackgroundBehavior: 0
+  displayResolutionDialog: 1
+  iosAllowHTTPDownload: 1
+  allowedAutorotateToPortrait: 1
+  allowedAutorotateToPortraitUpsideDown: 1
+  allowedAutorotateToLandscapeRight: 1
+  allowedAutorotateToLandscapeLeft: 1
+  useOSAutorotation: 1
+  use32BitDisplayBuffer: 1
+  preserveFramebufferAlpha: 0
+  disableDepthAndStencilBuffers: 0
+  androidBlitType: 0
+  defaultIsFullScreen: 1
+  defaultIsNativeResolution: 1
+  macRetinaSupport: 1
+  runInBackground: 0
+  captureSingleScreen: 0
+  muteOtherAudioSources: 0
+  Prepare IOS For Recording: 0
+  Force IOS Speakers When Recording: 0
+  deferSystemGesturesMode: 0
+  hideHomeButton: 0
+  submitAnalytics: 1
+  usePlayerLog: 1
+  bakeCollisionMeshes: 0
+  forceSingleInstance: 0
+  resizableWindow: 0
+  useMacAppStoreValidation: 0
+  macAppStoreCategory: public.app-category.games
+  gpuSkinning: 0
+  graphicsJobs: 0
+  xboxPIXTextureCapture: 0
+  xboxEnableAvatar: 0
+  xboxEnableKinect: 0
+  xboxEnableKinectAutoTracking: 0
+  xboxEnableFitness: 0
+  visibleInBackground: 1
+  allowFullscreenSwitch: 1
+  graphicsJobMode: 0
+  macFullscreenMode: 2
+  d3d11FullscreenMode: 1
+  xboxSpeechDB: 0
+  xboxEnableHeadOrientation: 0
+  xboxEnableGuest: 0
+  xboxEnablePIXSampling: 0
+  metalFramebufferOnly: 0
+  n3dsDisableStereoscopicView: 0
+  n3dsEnableSharedListOpt: 1
+  n3dsEnableVSync: 0
+  xboxOneResolution: 0
+  xboxOneSResolution: 0
+  xboxOneXResolution: 3
+  xboxOneMonoLoggingLevel: 0
+  xboxOneLoggingLevel: 1
+  xboxOneDisableEsram: 0
+  xboxOnePresentImmediateThreshold: 0
+  videoMemoryForVertexBuffers: 0
+  psp2PowerMode: 0
+  psp2AcquireBGM: 1
+  wiiUTVResolution: 0
+  wiiUGamePadMSAA: 1
+  wiiUSupportsNunchuk: 0
+  wiiUSupportsClassicController: 0
+  wiiUSupportsBalanceBoard: 0
+  wiiUSupportsMotionPlus: 0
+  wiiUSupportsProController: 0
+  wiiUAllowScreenCapture: 1
+  wiiUControllerCount: 0
+  m_SupportedAspectRatios:
+    4:3: 1
+    5:4: 1
+    16:10: 1
+    16:9: 1
+    Others: 1
+  bundleVersion: 1.0
+  preloadedAssets: []
+  metroInputSource: 0
+  wsaTransparentSwapchain: 0
+  m_HolographicPauseOnTrackingLoss: 1
+  xboxOneDisableKinectGpuReservation: 0
+  xboxOneEnable7thCore: 0
+  vrSettings:
+    cardboard:
+      depthFormat: 0
+      enableTransitionView: 0
+    daydream:
+      depthFormat: 0
+      useSustainedPerformanceMode: 0
+      enableVideoLayer: 0
+      useProtectedVideoMemory: 0
+      minimumSupportedHeadTracking: 0
+      maximumSupportedHeadTracking: 1
+    hololens:
+      depthFormat: 1
+      depthBufferSharingEnabled: 0
+    oculus:
+      sharedDepthBuffer: 0
+      dashSupport: 0
+  protectGraphicsMemory: 0
+  useHDRDisplay: 0
+  m_ColorGamuts: 00000000
+  targetPixelDensity: 30
+  resolutionScalingMode: 0
+  androidSupportedAspectRatio: 1
+  androidMaxAspectRatio: 2.1
+  applicationIdentifier: {}
+  buildNumber: {}
+  AndroidBundleVersionCode: 1
+  AndroidMinSdkVersion: 16
+  AndroidTargetSdkVersion: 0
+  AndroidPreferredInstallLocation: 1
+  aotOptions: 
+  stripEngineCode: 1
+  iPhoneStrippingLevel: 0
+  iPhoneScriptCallOptimization: 0
+  ForceInternetPermission: 0
+  ForceSDCardPermission: 0
+  CreateWallpaper: 0
+  APKExpansionFiles: 0
+  keepLoadedShadersAlive: 0
+  StripUnusedMeshComponents: 0
+  VertexChannelCompressionMask:
+    serializedVersion: 2
+    m_Bits: 238
+  iPhoneSdkVersion: 988
+  iOSTargetOSVersionString: 7.0
+  tvOSSdkVersion: 0
+  tvOSRequireExtendedGameController: 0
+  tvOSTargetOSVersionString: 9.0
+  uIPrerenderedIcon: 0
+  uIRequiresPersistentWiFi: 0
+  uIRequiresFullScreen: 1
+  uIStatusBarHidden: 1
+  uIExitOnSuspend: 0
+  uIStatusBarStyle: 0
+  iPhoneSplashScreen: {fileID: 0}
+  iPhoneHighResSplashScreen: {fileID: 0}
+  iPhoneTallHighResSplashScreen: {fileID: 0}
+  iPhone47inSplashScreen: {fileID: 0}
+  iPhone55inPortraitSplashScreen: {fileID: 0}
+  iPhone55inLandscapeSplashScreen: {fileID: 0}
+  iPhone58inPortraitSplashScreen: {fileID: 0}
+  iPhone58inLandscapeSplashScreen: {fileID: 0}
+  iPadPortraitSplashScreen: {fileID: 0}
+  iPadHighResPortraitSplashScreen: {fileID: 0}
+  iPadLandscapeSplashScreen: {fileID: 0}
+  iPadHighResLandscapeSplashScreen: {fileID: 0}
+  appleTVSplashScreen: {fileID: 0}
+  appleTVSplashScreen2x: {fileID: 0}
+  tvOSSmallIconLayers: []
+  tvOSSmallIconLayers2x: []
+  tvOSLargeIconLayers: []
+  tvOSTopShelfImageLayers: []
+  tvOSTopShelfImageLayers2x: []
+  tvOSTopShelfImageWideLayers: []
+  tvOSTopShelfImageWideLayers2x: []
+  iOSLaunchScreenType: 0
+  iOSLaunchScreenPortrait: {fileID: 0}
+  iOSLaunchScreenLandscape: {fileID: 0}
+  iOSLaunchScreenBackgroundColor:
+    serializedVersion: 2
+    rgba: 0
+  iOSLaunchScreenFillPct: 100
+  iOSLaunchScreenSize: 100
+  iOSLaunchScreenCustomXibPath: 
+  iOSLaunchScreeniPadType: 0
+  iOSLaunchScreeniPadImage: {fileID: 0}
+  iOSLaunchScreeniPadBackgroundColor:
+    serializedVersion: 2
+    rgba: 0
+  iOSLaunchScreeniPadFillPct: 100
+  iOSLaunchScreeniPadSize: 100
+  iOSLaunchScreeniPadCustomXibPath: 
+  iOSUseLaunchScreenStoryboard: 0
+  iOSLaunchScreenCustomStoryboardPath: 
+  iOSDeviceRequirements: []
+  iOSURLSchemes: []
+  iOSBackgroundModes: 0
+  iOSMetalForceHardShadows: 0
+  metalEditorSupport: 1
+  metalAPIValidation: 1
+  iOSRenderExtraFrameOnPause: 0
+  appleDeveloperTeamID: 
+  iOSManualSigningProvisioningProfileID: 
+  tvOSManualSigningProvisioningProfileID: 
+  appleEnableAutomaticSigning: 0
+  clonedFromGUID: 00000000000000000000000000000000
+  AndroidTargetDevice: 0
+  AndroidSplashScreenScale: 0
+  androidSplashScreen: {fileID: 0}
+  AndroidKeystoreName: 
+  AndroidKeyaliasName: 
+  AndroidTVCompatibility: 1
+  AndroidIsGame: 1
+  AndroidEnableTango: 0
+  androidEnableBanner: 1
+  androidUseLowAccuracyLocation: 0
+  m_AndroidBanners:
+  - width: 320
+    height: 180
+    banner: {fileID: 0}
+  androidGamepadSupportLevel: 0
+  resolutionDialogBanner: {fileID: 0}
+  m_BuildTargetIcons: []
+  m_BuildTargetBatching: []
+  m_BuildTargetGraphicsAPIs: []
+  m_BuildTargetVRSettings: []
+  m_BuildTargetEnableVuforiaSettings: []
+  openGLRequireES31: 0
+  openGLRequireES31AEP: 0
+  m_TemplateCustomTags: {}
+  mobileMTRendering:
+    Android: 1
+    iPhone: 1
+    tvOS: 1
+  m_BuildTargetGroupLightmapEncodingQuality: []
+  wiiUTitleID: 0005000011000000
+  wiiUGroupID: 00010000
+  wiiUCommonSaveSize: 4096
+  wiiUAccountSaveSize: 2048
+  wiiUOlvAccessKey: 0
+  wiiUTinCode: 0
+  wiiUJoinGameId: 0
+  wiiUJoinGameModeMask: 0000000000000000
+  wiiUCommonBossSize: 0
+  wiiUAccountBossSize: 0
+  wiiUAddOnUniqueIDs: []
+  wiiUMainThreadStackSize: 3072
+  wiiULoaderThreadStackSize: 1024
+  wiiUSystemHeapSize: 128
+  wiiUTVStartupScreen: {fileID: 0}
+  wiiUGamePadStartupScreen: {fileID: 0}
+  wiiUDrcBufferDisabled: 0
+  wiiUProfilerLibPath: 
+  playModeTestRunnerEnabled: 0
+  actionOnDotNetUnhandledException: 1
+  enableInternalProfiler: 0
+  logObjCUncaughtExceptions: 1
+  enableCrashReportAPI: 0
+  cameraUsageDescription: 
+  locationUsageDescription: 
+  microphoneUsageDescription: 
+  switchNetLibKey: 
+  switchSocketMemoryPoolSize: 6144
+  switchSocketAllocatorPoolSize: 128
+  switchSocketConcurrencyLimit: 14
+  switchScreenResolutionBehavior: 2
+  switchUseCPUProfiler: 0
+  switchApplicationID: 0x01004b9000490000
+  switchNSODependencies: 
+  switchTitleNames_0: 
+  switchTitleNames_1: 
+  switchTitleNames_2: 
+  switchTitleNames_3: 
+  switchTitleNames_4: 
+  switchTitleNames_5: 
+  switchTitleNames_6: 
+  switchTitleNames_7: 
+  switchTitleNames_8: 
+  switchTitleNames_9: 
+  switchTitleNames_10: 
+  switchTitleNames_11: 
+  switchTitleNames_12: 
+  switchTitleNames_13: 
+  switchTitleNames_14: 
+  switchPublisherNames_0: 
+  switchPublisherNames_1: 
+  switchPublisherNames_2: 
+  switchPublisherNames_3: 
+  switchPublisherNames_4: 
+  switchPublisherNames_5: 
+  switchPublisherNames_6: 
+  switchPublisherNames_7: 
+  switchPublisherNames_8: 
+  switchPublisherNames_9: 
+  switchPublisherNames_10: 
+  switchPublisherNames_11: 
+  switchPublisherNames_12: 
+  switchPublisherNames_13: 
+  switchPublisherNames_14: 
+  switchIcons_0: {fileID: 0}
+  switchIcons_1: {fileID: 0}
+  switchIcons_2: {fileID: 0}
+  switchIcons_3: {fileID: 0}
+  switchIcons_4: {fileID: 0}
+  switchIcons_5: {fileID: 0}
+  switchIcons_6: {fileID: 0}
+  switchIcons_7: {fileID: 0}
+  switchIcons_8: {fileID: 0}
+  switchIcons_9: {fileID: 0}
+  switchIcons_10: {fileID: 0}
+  switchIcons_11: {fileID: 0}
+  switchIcons_12: {fileID: 0}
+  switchIcons_13: {fileID: 0}
+  switchIcons_14: {fileID: 0}
+  switchSmallIcons_0: {fileID: 0}
+  switchSmallIcons_1: {fileID: 0}
+  switchSmallIcons_2: {fileID: 0}
+  switchSmallIcons_3: {fileID: 0}
+  switchSmallIcons_4: {fileID: 0}
+  switchSmallIcons_5: {fileID: 0}
+  switchSmallIcons_6: {fileID: 0}
+  switchSmallIcons_7: {fileID: 0}
+  switchSmallIcons_8: {fileID: 0}
+  switchSmallIcons_9: {fileID: 0}
+  switchSmallIcons_10: {fileID: 0}
+  switchSmallIcons_11: {fileID: 0}
+  switchSmallIcons_12: {fileID: 0}
+  switchSmallIcons_13: {fileID: 0}
+  switchSmallIcons_14: {fileID: 0}
+  switchManualHTML: 
+  switchAccessibleURLs: 
+  switchLegalInformation: 
+  switchMainThreadStackSize: 1048576
+  switchPresenceGroupId: 
+  switchLogoHandling: 0
+  switchReleaseVersion: 0
+  switchDisplayVersion: 1.0.0
+  switchStartupUserAccount: 0
+  switchTouchScreenUsage: 0
+  switchSupportedLanguagesMask: 0
+  switchLogoType: 0
+  switchApplicationErrorCodeCategory: 
+  switchUserAccountSaveDataSize: 0
+  switchUserAccountSaveDataJournalSize: 0
+  switchApplicationAttribute: 0
+  switchCardSpecSize: -1
+  switchCardSpecClock: -1
+  switchRatingsMask: 0
+  switchRatingsInt_0: 0
+  switchRatingsInt_1: 0
+  switchRatingsInt_2: 0
+  switchRatingsInt_3: 0
+  switchRatingsInt_4: 0
+  switchRatingsInt_5: 0
+  switchRatingsInt_6: 0
+  switchRatingsInt_7: 0
+  switchRatingsInt_8: 0
+  switchRatingsInt_9: 0
+  switchRatingsInt_10: 0
+  switchRatingsInt_11: 0
+  switchLocalCommunicationIds_0: 
+  switchLocalCommunicationIds_1: 
+  switchLocalCommunicationIds_2: 
+  switchLocalCommunicationIds_3: 
+  switchLocalCommunicationIds_4: 
+  switchLocalCommunicationIds_5: 
+  switchLocalCommunicationIds_6: 
+  switchLocalCommunicationIds_7: 
+  switchParentalControl: 0
+  switchAllowsScreenshot: 1
+  switchAllowsVideoCapturing: 1
+  switchAllowsRuntimeAddOnContentInstall: 0
+  switchDataLossConfirmation: 0
+  switchSupportedNpadStyles: 3
+  switchSocketConfigEnabled: 0
+  switchTcpInitialSendBufferSize: 32
+  switchTcpInitialReceiveBufferSize: 64
+  switchTcpAutoSendBufferSizeMax: 256
+  switchTcpAutoReceiveBufferSizeMax: 256
+  switchUdpSendBufferSize: 9
+  switchUdpReceiveBufferSize: 42
+  switchSocketBufferEfficiency: 4
+  switchSocketInitializeEnabled: 1
+  switchNetworkInterfaceManagerInitializeEnabled: 1
+  switchPlayerConnectionEnabled: 1
+  ps4NPAgeRating: 12
+  ps4NPTitleSecret: 
+  ps4NPTrophyPackPath: 
+  ps4ParentalLevel: 11
+  ps4ContentID: ED1633-NPXX51362_00-0000000000000000
+  ps4Category: 0
+  ps4MasterVersion: 01.00
+  ps4AppVersion: 01.00
+  ps4AppType: 0
+  ps4ParamSfxPath: 
+  ps4VideoOutPixelFormat: 0
+  ps4VideoOutInitialWidth: 1920
+  ps4VideoOutBaseModeInitialWidth: 1920
+  ps4VideoOutReprojectionRate: 60
+  ps4PronunciationXMLPath: 
+  ps4PronunciationSIGPath: 
+  ps4BackgroundImagePath: 
+  ps4StartupImagePath: 
+  ps4StartupImagesFolder: 
+  ps4IconImagesFolder: 
+  ps4SaveDataImagePath: 
+  ps4SdkOverride: 
+  ps4BGMPath: 
+  ps4ShareFilePath: 
+  ps4ShareOverlayImagePath: 
+  ps4PrivacyGuardImagePath: 
+  ps4NPtitleDatPath: 
+  ps4RemotePlayKeyAssignment: -1
+  ps4RemotePlayKeyMappingDir: 
+  ps4PlayTogetherPlayerCount: 0
+  ps4EnterButtonAssignment: 1
+  ps4ApplicationParam1: 0
+  ps4ApplicationParam2: 0
+  ps4ApplicationParam3: 0
+  ps4ApplicationParam4: 0
+  ps4DownloadDataSize: 0
+  ps4GarlicHeapSize: 2048
+  ps4ProGarlicHeapSize: 2560
+  ps4Passcode: d3hjjul8UhK6ZnQCEBYYQPozR9sQV066
+  ps4pnSessions: 1
+  ps4pnPresence: 1
+  ps4pnFriends: 1
+  ps4pnGameCustomData: 1
+  playerPrefsSupport: 0
+  restrictedAudioUsageRights: 0
+  ps4UseResolutionFallback: 0
+  ps4ReprojectionSupport: 0
+  ps4UseAudio3dBackend: 0
+  ps4SocialScreenEnabled: 0
+  ps4ScriptOptimizationLevel: 0
+  ps4Audio3dVirtualSpeakerCount: 14
+  ps4attribCpuUsage: 0
+  ps4PatchPkgPath: 
+  ps4PatchLatestPkgPath: 
+  ps4PatchChangeinfoPath: 
+  ps4PatchDayOne: 0
+  ps4attribUserManagement: 0
+  ps4attribMoveSupport: 0
+  ps4attrib3DSupport: 0
+  ps4attribShareSupport: 0
+  ps4attribExclusiveVR: 0
+  ps4disableAutoHideSplash: 0
+  ps4videoRecordingFeaturesUsed: 0
+  ps4contentSearchFeaturesUsed: 0
+  ps4attribEyeToEyeDistanceSettingVR: 0
+  ps4IncludedModules: []
+  monoEnv: 
+  psp2Splashimage: {fileID: 0}
+  psp2NPTrophyPackPath: 
+  psp2NPSupportGBMorGJP: 0
+  psp2NPAgeRating: 12
+  psp2NPTitleDatPath: 
+  psp2NPCommsID: 
+  psp2NPCommunicationsID: 
+  psp2NPCommsPassphrase: 
+  psp2NPCommsSig: 
+  psp2ParamSfxPath: 
+  psp2ManualPath: 
+  psp2LiveAreaGatePath: 
+  psp2LiveAreaBackroundPath: 
+  psp2LiveAreaPath: 
+  psp2LiveAreaTrialPath: 
+  psp2PatchChangeInfoPath: 
+  psp2PatchOriginalPackage: 
+  psp2PackagePassword: 3onkgZsAECEn0fzCoWiCtWCKe4l74pE5
+  psp2KeystoneFile: 
+  psp2MemoryExpansionMode: 0
+  psp2DRMType: 0
+  psp2StorageType: 0
+  psp2MediaCapacity: 0
+  psp2DLCConfigPath: 
+  psp2ThumbnailPath: 
+  psp2BackgroundPath: 
+  psp2SoundPath: 
+  psp2TrophyCommId: 
+  psp2TrophyPackagePath: 
+  psp2PackagedResourcesPath: 
+  psp2SaveDataQuota: 10240
+  psp2ParentalLevel: 1
+  psp2ShortTitle: Not Set
+  psp2ContentID: IV0000-ABCD12345_00-0123456789ABCDEF
+  psp2Category: 0
+  psp2MasterVersion: 01.00
+  psp2AppVersion: 01.00
+  psp2TVBootMode: 0
+  psp2EnterButtonAssignment: 2
+  psp2TVDisableEmu: 0
+  psp2AllowTwitterDialog: 1
+  psp2Upgradable: 0
+  psp2HealthWarning: 0
+  psp2UseLibLocation: 0
+  psp2InfoBarOnStartup: 0
+  psp2InfoBarColor: 0
+  psp2ScriptOptimizationLevel: 0
+  psmSplashimage: {fileID: 0}
+  splashScreenBackgroundSourceLandscape: {fileID: 0}
+  splashScreenBackgroundSourcePortrait: {fileID: 0}
+  spritePackerPolicy: 
+  webGLMemorySize: 256
+  webGLExceptionSupport: 1
+  webGLNameFilesAsHashes: 0
+  webGLDataCaching: 0
+  webGLDebugSymbols: 0
+  webGLEmscriptenArgs: 
+  webGLModulesDirectory: 
+  webGLTemplate: APPLICATION:Default
+  webGLAnalyzeBuildSize: 0
+  webGLUseEmbeddedResources: 0
+  webGLUseWasm: 0
+  webGLCompressionFormat: 1
+  scriptingDefineSymbols: {}
+  platformArchitecture: {}
+  scriptingBackend: {}
+  incrementalIl2cppBuild: {}
+  additionalIl2CppArgs: 
+  scriptingRuntimeVersion: 0
+  apiCompatibilityLevelPerPlatform: {}
+  m_RenderingPath: 1
+  m_MobileRenderingPath: 1
+  metroPackageName: TensorFlowLitePlugin
+  metroPackageVersion: 
+  metroCertificatePath: 
+  metroCertificatePassword: 
+  metroCertificateSubject: 
+  metroCertificateIssuer: 
+  metroCertificateNotAfter: 0000000000000000
+  metroApplicationDescription: TensorFlowLitePlugin
+  wsaImages: {}
+  metroTileShortName: 
+  metroCommandLineArgsFile: 
+  metroTileShowName: 0
+  metroMediumTileShowName: 0
+  metroLargeTileShowName: 0
+  metroWideTileShowName: 0
+  metroDefaultTileSize: 1
+  metroTileForegroundText: 2
+  metroTileBackgroundColor: {r: 0.13333334, g: 0.17254902, b: 0.21568628, a: 0}
+  metroSplashScreenBackgroundColor: {r: 0.12941177, g: 0.17254902, b: 0.21568628,
+    a: 1}
+  metroSplashScreenUseBackgroundColor: 0
+  platformCapabilities: {}
+  metroFTAName: 
+  metroFTAFileTypes: []
+  metroProtocolName: 
+  metroCompilationOverrides: 1
+  tizenProductDescription: 
+  tizenProductURL: 
+  tizenSigningProfileName: 
+  tizenGPSPermissions: 0
+  tizenMicrophonePermissions: 0
+  tizenDeploymentTarget: 
+  tizenDeploymentTargetType: -1
+  tizenMinOSVersion: 1
+  n3dsUseExtSaveData: 0
+  n3dsCompressStaticMem: 1
+  n3dsExtSaveDataNumber: 0x12345
+  n3dsStackSize: 131072
+  n3dsTargetPlatform: 2
+  n3dsRegion: 7
+  n3dsMediaSize: 0
+  n3dsLogoStyle: 3
+  n3dsTitle: GameName
+  n3dsProductCode: 
+  n3dsApplicationId: 0xFF3FF
+  XboxOneProductId: 
+  XboxOneUpdateKey: 
+  XboxOneSandboxId: 
+  XboxOneContentId: 
+  XboxOneTitleId: 
+  XboxOneSCId: 
+  XboxOneGameOsOverridePath: 
+  XboxOnePackagingOverridePath: 
+  XboxOneAppManifestOverridePath: 
+  XboxOnePackageEncryption: 0
+  XboxOnePackageUpdateGranularity: 2
+  XboxOneDescription: 
+  XboxOneLanguage:
+  - enus
+  XboxOneCapability: []
+  XboxOneGameRating: {}
+  XboxOneIsContentPackage: 0
+  XboxOneEnableGPUVariability: 0
+  XboxOneSockets: {}
+  XboxOneSplashScreen: {fileID: 0}
+  XboxOneAllowedProductIds: []
+  XboxOnePersistentLocalStorageSize: 0
+  XboxOneXTitleMemory: 8
+  xboxOneScriptCompiler: 0
+  vrEditorSettings:
+    daydream:
+      daydreamIconForeground: {fileID: 0}
+      daydreamIconBackground: {fileID: 0}
+  cloudServicesEnabled: {}
+  facebookSdkVersion: 7.9.4
+  apiCompatibilityLevel: 2
+  cloudProjectId: 
+  projectName: 
+  organizationId: 
+  cloudEnabled: 0
+  enableNativePlatformBackendsForNewInputSystem: 0
+  disableOldInputManagerSupport: 0
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectVersion.txt b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectVersion.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4a9cfb61ab55abc2f0d09b0225a802ef8122eaaf
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/ProjectVersion.txt
@@ -0,0 +1 @@
+m_EditorVersion: 2017.4.6f1
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/QualitySettings.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/QualitySettings.asset
new file mode 100644
index 0000000000000000000000000000000000000000..05daac3c4922feef068af19efa921fcbb476afde
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/QualitySettings.asset
@@ -0,0 +1,191 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!47 &1
+QualitySettings:
+  m_ObjectHideFlags: 0
+  serializedVersion: 5
+  m_CurrentQuality: 5
+  m_QualitySettings:
+  - serializedVersion: 2
+    name: Very Low
+    pixelLightCount: 0
+    shadows: 0
+    shadowResolution: 0
+    shadowProjection: 1
+    shadowCascades: 1
+    shadowDistance: 15
+    shadowNearPlaneOffset: 3
+    shadowCascade2Split: 0.33333334
+    shadowCascade4Split: {x: 0.06666667, y: 0.2, z: 0.46666667}
+    shadowmaskMode: 0
+    blendWeights: 1
+    textureQuality: 1
+    anisotropicTextures: 0
+    antiAliasing: 0
+    softParticles: 0
+    softVegetation: 0
+    realtimeReflectionProbes: 0
+    billboardsFaceCameraPosition: 0
+    vSyncCount: 0
+    lodBias: 0.3
+    maximumLODLevel: 0
+    particleRaycastBudget: 4
+    asyncUploadTimeSlice: 2
+    asyncUploadBufferSize: 4
+    resolutionScalingFixedDPIFactor: 1
+    excludedTargetPlatforms: []
+  - serializedVersion: 2
+    name: Low
+    pixelLightCount: 0
+    shadows: 0
+    shadowResolution: 0
+    shadowProjection: 1
+    shadowCascades: 1
+    shadowDistance: 20
+    shadowNearPlaneOffset: 3
+    shadowCascade2Split: 0.33333334
+    shadowCascade4Split: {x: 0.06666667, y: 0.2, z: 0.46666667}
+    shadowmaskMode: 0
+    blendWeights: 2
+    textureQuality: 0
+    anisotropicTextures: 0
+    antiAliasing: 0
+    softParticles: 0
+    softVegetation: 0
+    realtimeReflectionProbes: 0
+    billboardsFaceCameraPosition: 0
+    vSyncCount: 0
+    lodBias: 0.4
+    maximumLODLevel: 0
+    particleRaycastBudget: 16
+    asyncUploadTimeSlice: 2
+    asyncUploadBufferSize: 4
+    resolutionScalingFixedDPIFactor: 1
+    excludedTargetPlatforms: []
+  - serializedVersion: 2
+    name: Medium
+    pixelLightCount: 1
+    shadows: 1
+    shadowResolution: 0
+    shadowProjection: 1
+    shadowCascades: 1
+    shadowDistance: 20
+    shadowNearPlaneOffset: 3
+    shadowCascade2Split: 0.33333334
+    shadowCascade4Split: {x: 0.06666667, y: 0.2, z: 0.46666667}
+    shadowmaskMode: 0
+    blendWeights: 2
+    textureQuality: 0
+    anisotropicTextures: 1
+    antiAliasing: 0
+    softParticles: 0
+    softVegetation: 0
+    realtimeReflectionProbes: 0
+    billboardsFaceCameraPosition: 0
+    vSyncCount: 1
+    lodBias: 0.7
+    maximumLODLevel: 0
+    particleRaycastBudget: 64
+    asyncUploadTimeSlice: 2
+    asyncUploadBufferSize: 4
+    resolutionScalingFixedDPIFactor: 1
+    excludedTargetPlatforms: []
+  - serializedVersion: 2
+    name: High
+    pixelLightCount: 2
+    shadows: 2
+    shadowResolution: 1
+    shadowProjection: 1
+    shadowCascades: 2
+    shadowDistance: 40
+    shadowNearPlaneOffset: 3
+    shadowCascade2Split: 0.33333334
+    shadowCascade4Split: {x: 0.06666667, y: 0.2, z: 0.46666667}
+    shadowmaskMode: 1
+    blendWeights: 2
+    textureQuality: 0
+    anisotropicTextures: 1
+    antiAliasing: 0
+    softParticles: 0
+    softVegetation: 1
+    realtimeReflectionProbes: 1
+    billboardsFaceCameraPosition: 1
+    vSyncCount: 1
+    lodBias: 1
+    maximumLODLevel: 0
+    particleRaycastBudget: 256
+    asyncUploadTimeSlice: 2
+    asyncUploadBufferSize: 4
+    resolutionScalingFixedDPIFactor: 1
+    excludedTargetPlatforms: []
+  - serializedVersion: 2
+    name: Very High
+    pixelLightCount: 3
+    shadows: 2
+    shadowResolution: 2
+    shadowProjection: 1
+    shadowCascades: 2
+    shadowDistance: 70
+    shadowNearPlaneOffset: 3
+    shadowCascade2Split: 0.33333334
+    shadowCascade4Split: {x: 0.06666667, y: 0.2, z: 0.46666667}
+    shadowmaskMode: 1
+    blendWeights: 4
+    textureQuality: 0
+    anisotropicTextures: 2
+    antiAliasing: 2
+    softParticles: 1
+    softVegetation: 1
+    realtimeReflectionProbes: 1
+    billboardsFaceCameraPosition: 1
+    vSyncCount: 1
+    lodBias: 1.5
+    maximumLODLevel: 0
+    particleRaycastBudget: 1024
+    asyncUploadTimeSlice: 2
+    asyncUploadBufferSize: 4
+    resolutionScalingFixedDPIFactor: 1
+    excludedTargetPlatforms: []
+  - serializedVersion: 2
+    name: Ultra
+    pixelLightCount: 4
+    shadows: 2
+    shadowResolution: 2
+    shadowProjection: 1
+    shadowCascades: 4
+    shadowDistance: 150
+    shadowNearPlaneOffset: 3
+    shadowCascade2Split: 0.33333334
+    shadowCascade4Split: {x: 0.06666667, y: 0.2, z: 0.46666667}
+    shadowmaskMode: 1
+    blendWeights: 4
+    textureQuality: 0
+    anisotropicTextures: 2
+    antiAliasing: 2
+    softParticles: 1
+    softVegetation: 1
+    realtimeReflectionProbes: 1
+    billboardsFaceCameraPosition: 1
+    vSyncCount: 1
+    lodBias: 2
+    maximumLODLevel: 0
+    particleRaycastBudget: 4096
+    asyncUploadTimeSlice: 2
+    asyncUploadBufferSize: 4
+    resolutionScalingFixedDPIFactor: 1
+    excludedTargetPlatforms: []
+  m_PerPlatformDefaultQuality:
+    Android: 2
+    Nintendo 3DS: 5
+    Nintendo Switch: 5
+    PS4: 5
+    PSM: 5
+    PSP2: 2
+    Standalone: 5
+    Tizen: 2
+    WebGL: 3
+    WiiU: 5
+    Windows Store Apps: 5
+    XboxOne: 5
+    iPhone: 2
+    tvOS: 2
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TagManager.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TagManager.asset
new file mode 100644
index 0000000000000000000000000000000000000000..1c92a7840ec11895c76785f65d949a3d20d53355
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TagManager.asset
@@ -0,0 +1,43 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!78 &1
+TagManager:
+  serializedVersion: 2
+  tags: []
+  layers:
+  - Default
+  - TransparentFX
+  - Ignore Raycast
+  - 
+  - Water
+  - UI
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  - 
+  m_SortingLayers:
+  - name: Default
+    uniqueID: 0
+    locked: 0
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TimeManager.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TimeManager.asset
new file mode 100644
index 0000000000000000000000000000000000000000..558a017e1f50b2db73414a1abad3c033922774f8
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/TimeManager.asset
@@ -0,0 +1,9 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!5 &1
+TimeManager:
+  m_ObjectHideFlags: 0
+  Fixed Timestep: 0.02
+  Maximum Allowed Timestep: 0.33333334
+  m_TimeScale: 1
+  Maximum Particle Timestep: 0.03
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/UnityConnectSettings.asset b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/UnityConnectSettings.asset
new file mode 100644
index 0000000000000000000000000000000000000000..3da14d5baf1fa24df1746c3ce9d969eda3a9c59d
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/ProjectSettings/UnityConnectSettings.asset
@@ -0,0 +1,34 @@
+%YAML 1.1
+%TAG !u! tag:unity3d.com,2011:
+--- !u!310 &1
+UnityConnectSettings:
+  m_ObjectHideFlags: 0
+  m_Enabled: 0
+  m_TestMode: 0
+  m_TestEventUrl: 
+  m_TestConfigUrl: 
+  m_TestInitMode: 0
+  CrashReportingSettings:
+    m_EventUrl: https://perf-events.cloud.unity3d.com/api/events/crashes
+    m_NativeEventUrl: https://perf-events.cloud.unity3d.com/symbolicate
+    m_Enabled: 0
+    m_CaptureEditorExceptions: 1
+  UnityPurchasingSettings:
+    m_Enabled: 0
+    m_TestMode: 0
+  UnityAnalyticsSettings:
+    m_Enabled: 0
+    m_InitializeOnStartup: 1
+    m_TestMode: 0
+    m_TestEventUrl: 
+    m_TestConfigUrl: 
+  UnityAdsSettings:
+    m_Enabled: 0
+    m_InitializeOnStartup: 1
+    m_TestMode: 0
+    m_IosGameId: 
+    m_AndroidGameId: 
+    m_GameIds: {}
+    m_GameId: 
+  PerformanceReportingSettings:
+    m_Enabled: 0
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0b3813fccb10c3a89fb462f9ab6bb81c6a9a147a
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/README.md
@@ -0,0 +1,24 @@
+# TF Lite Experimental Unity Plugin
+
+This directoryy contains an experimental sample Unity (2017) Plugin, based on
+the experimental TF Lite C API. The sample demonstrates running inference within
+Unity by way of a C# `Interpreter` wrapper.
+
+Note that the native TF Lite plugin(s) *must* be built before using the Unity
+Plugin, and placed in Assets/TensorFlowLite/SDK/Plugins/. For the editor (note
+that this has only been tested on Linux; the syntax may differ on Mac/Windows):
+
+```sh
+bazel build -c opt --cxxopt=--std=c++11 \
+  //tensorflow/contrib/lite/experimental/c:libtensorflowlite_c.so
+```
+
+and for Android:
+
+```sh
+bazel build -c opt --cxxopt=--std=c++11 \
+  --crosstool_top=//external:android/crosstool \
+  --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
+  --cpu=armeabi-v7a \
+  //tensorflow/contrib/lite/experimental/c:libtensorflowlite_c.so
+```
diff --git a/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/UnityPackageManager/manifest.json b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/UnityPackageManager/manifest.json
new file mode 100644
index 0000000000000000000000000000000000000000..526aca60573f334a6b6bd536fa5be9c26d678e0f
--- /dev/null
+++ b/tensorflow/contrib/lite/experimental/examples/unity/TensorFlowLitePlugin/UnityPackageManager/manifest.json
@@ -0,0 +1,4 @@
+{
+	"dependencies": {
+	}
+}
diff --git a/tensorflow/contrib/lite/g3doc/README.md b/tensorflow/contrib/lite/g3doc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e3db4784815b7562588d3afbd34f837b101f0977
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/README.md
@@ -0,0 +1,4 @@
+This is a *work-in-progress* TF Lite subsite for:
+https://www.tensorflow.org/mobile
+
+DO NOT PUBLISH
diff --git a/tensorflow/contrib/lite/g3doc/_book.yaml b/tensorflow/contrib/lite/g3doc/_book.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..98abd5743b2412399496f2fb3a70cd25d8597bca
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/_book.yaml
@@ -0,0 +1,58 @@
+upper_tabs:
+# Tabs left of dropdown menu
+- include: /_upper_tabs_left.yaml
+# Dropdown menu
+- name: Ecosystem
+  path: /ecosystem
+  is_default: True
+  menu:
+  - include: /ecosystem/_menu_toc.yaml
+  lower_tabs:
+    # Subsite tabs
+    other:
+    - name: Guide
+      contents:
+      - title: Overview
+        path: /mobile/overview
+      - title: Developer Guide
+        path: /mobile/devguide
+      - title: Android Demo App
+        path: /mobile/demo_android
+      - title: iOS Demo App
+        path: /mobile/demo_ios
+      - title: Performance
+        path: /mobile/performance
+      - break: True
+      - title: TensorFlow Lite APIs
+        path: /mobile/apis
+      - title: Custom operators
+        path: /mobile/custom_operators
+      - title: TensorFlow Lite Ops Versioning
+        path: /mobile/ops_versioning
+      - title: TensorFlow Lite Compatibility Guide
+        path: /mobile/tf_ops_compatibility
+      - title: List of Hosted Models
+        path: /mobile/models
+      - title: TensorFlow Lite for iOS
+        path: /mobile/ios
+      - title: TensorFlow Lite for Raspberry Pi
+        path: /mobile/rpi
+
+      - heading: TF Mobile
+        status: deprecated
+      - title: Overview
+        path: /mobile/tfmobile/
+      - title: Building TensorFlow on Android
+        path: /mobile/tfmobile/android_build
+      - title: Building TensorFlow on IOS
+        path: /mobile/tfmobile/ios_build
+      - title: Integrating TensorFlow libraries
+        path: /mobile/tfmobile/linking_libs
+      - title: Preparing models for mobile deployment
+        path: /mobile/tfmobile/prepare_models
+      - title: Optimizing for mobile
+        path: /mobile/tfmobile/optimizing
+
+    - name: API
+      contents:
+      - include: /mobile/api_docs/python/_toc.yaml
diff --git a/tensorflow/contrib/lite/g3doc/_index.yaml b/tensorflow/contrib/lite/g3doc/_index.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..9119e49117ffbda268f36324072d30ffd83c9e6c
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/_index.yaml
@@ -0,0 +1,67 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+description: <!--no description-->
+landing_page:
+  rows:
+  - heading: TensorFlow Lite is a lightweight solution for mobile and embedded devices.
+    items:
+    - description: >
+        TensorFlow Lite is TensorFlow’s lightweight solution for mobile and
+        embedded devices. It enables on-device machine learning inference with
+        low latency and a small binary size. TensorFlow Lite also supports
+        hardware acceleration with the
+        <a href='https://developer.android.com/ndk/guides/neuralnetworks/index.html'>Android Neural Networks API</a>.
+      list:
+      - heading: Key point 1
+        description: >
+          [high-level overview]
+        icon:
+          icon_name: chevron_right
+          foreground: theme
+          background: grey
+      - heading: Key point 2
+        description: >
+          [high-level overview]
+        icon:
+          icon_name: chevron_right
+          foreground: theme
+          background: grey
+      - heading: Key point 3
+        description: >
+          [high-level overview]
+        icon:
+          icon_name: chevron_right
+          foreground: theme
+          background: grey
+    - code_block: |
+        <pre class = "prettyprint">
+        $ toco --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
+               --input_format=TENSORFLOW_GRAPHDEF \
+               --output_format=TFLITE \
+               --output_file=/tmp/mobilenet_v1_1.0_224.tflite \
+               --inference_type=FLOAT \
+               --input_type=FLOAT \
+               --input_arrays=input \
+               --output_arrays=MobilenetV1/Predictions/Reshape_1 \
+               --input_shapes=1,224,224,3
+        </pre>
+
+  - classname: devsite-landing-row-cards
+    items:
+    - heading: Using TensorFlow Lite on Android
+      image_path: /ecosystem/images/tf-logo-card-16x9.png
+      path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d
+      buttons:
+      - label: Read on TensorFlow blog
+        path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d
+    - heading: TensorFlow Lite at the Dev Summit
+      youtube_id: FAMfy7izB6A
+      buttons:
+      - label: Watch the video
+        path: https://www.youtube.com/watch?v=FAMfy7izB6A
+    - heading: TensorFlow Lite on GitHub
+      image_path: /ecosystem/images/github-card-16x9.png
+      path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite
+      buttons:
+      - label: View on GitHub
+        path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite
diff --git a/tensorflow/contrib/lite/g3doc/_project.yaml b/tensorflow/contrib/lite/g3doc/_project.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b39666516baab42d289e4d40077c2877ed65d396
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/_project.yaml
@@ -0,0 +1,10 @@
+name: TensorFlow Lite
+breadcrumb_name: Mobile
+home_url: /mobile/
+parent_project_metadata_path: /_project.yaml
+description: >
+  TensorFlow Lite is a lightweight solution for mobile and embedded devices.
+use_site_branding: True
+hide_from_products_list: True
+content_license: cc3-apache2
+buganizer_id: 316308
diff --git a/tensorflow/contrib/lite/g3doc/api_docs/python/_toc.yaml b/tensorflow/contrib/lite/g3doc/api_docs/python/_toc.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1e1c44c6929571144d8cf0b54463c48e37466022
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/api_docs/python/_toc.yaml
@@ -0,0 +1,6 @@
+# Automatically generated file; please do not edit
+toc:
+  - title: TensorFlow Lite
+    section:
+    - title: Overview
+      path: /mobile/api_docs/python/
diff --git a/tensorflow/contrib/lite/g3doc/api_docs/python/index.md b/tensorflow/contrib/lite/g3doc/api_docs/python/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..70031a3c3d26eb6557014879cc92288cd22331eb
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/api_docs/python/index.md
@@ -0,0 +1,10 @@
+Project: /mobile/_project.yaml
+Book: /mobile/_book.yaml
+page_type: reference
+<style> table img { max-width: 100%; } </style>
+<script src="/_static/js/managed/mathjax/MathJax.js?config=TeX-AMS-MML_SVG"></script>
+
+<!-- DO NOT EDIT! Automatically generated file. -->
+# All symbols in TensorFlow Lite
+
+TEMP PAGE
diff --git a/tensorflow/contrib/lite/g3doc/apis.md b/tensorflow/contrib/lite/g3doc/apis.md
index a591a353dd8f0ac94ecaa3f12e1aa1c57566ef69..776803da8c7126c6198e3740448888119df030b9 100644
--- a/tensorflow/contrib/lite/g3doc/apis.md
+++ b/tensorflow/contrib/lite/g3doc/apis.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # TensorFlow Lite APIs
 
 TensorFlow Lite provides programming APIs in C++ and Java, and in both cases
@@ -53,6 +56,7 @@ typedef enum {
 ```
 
 Failures can be easily verified with:
+
 ```c++
 if (status != kTfLiteOk) {
   // ... error handling here ...
diff --git a/tensorflow/contrib/lite/g3doc/benchmarks.md b/tensorflow/contrib/lite/g3doc/benchmarks.md
deleted file mode 100644
index 96536cba271922f1bec51f915c2996c8d9de3b9b..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/lite/g3doc/benchmarks.md
+++ /dev/null
@@ -1,178 +0,0 @@
-# Performance Benchmark numbers
-
-This document contains the performance benchmark numbers for running a few well
-known models on some Android and iOS devices.
-
-The benchmark numbers were generated by running the [TFLite benchmark
-binary](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark)
-on Android and running the [iOS benchmark
-app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark/ios)
-on iOS.
-
-# Android benchmarks
-
-When running Android benchmarks, the CPU affinity is set to use big cores on the
-device to reduce variance (see
-[details](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark#reducing-variance-between-runs-on-android)).
-
-Models are assumed to have been downloaded from the link, unzipped and pushed to
-`/data/local/tmp/tflite_models` folder. The benchmark binary is built according
-to instructions listed
-[here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark#on-android)
-and is assumed to have been pushed to `/data/local/tmp`.
-
-The following command was used to run the benchmark:
-
-```
-adb shell taskset ${CPU_MASK} /data/local/tmp/benchmark_model \
-  --num_threads=1 \
-  --graph=/data/local/tmp/tflite_models/${GRAPH} \
-  --warmup_runs=1 \
-  --num_runs=50 \
-  --use_nnapi=false
-```
-
-where `${GRAPH}` is the name of model and `${CPU_MASK}` is the CPU affinity
-chosen according to the following table:
-
-Device | CPU_MASK |
--------| ----------
-Pixel 2 | f0 |
-Pixel xl | 0c |
-
-
-<table>
-  <thead>
-    <tr>
-      <th>Model Name</th>
-      <th>Device </th>
-      <th>Mean inference time (std dev)</th>
-    </tr>
-  </thead>
-  <tr>
-    <td rowspan = 2>
-      <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz">Mobilenet_1.0_224(float)</a>
-    </td>
-    <td>Pixel 2 </td>
-    <td>166.5 ms (2.6 ms)</td>
-  </tr>
-   <tr>
-     <td>Pixel xl </td>
-     <td>122.9 ms (1.8 ms)  </td>
-  </tr>
-  <tr>
-    <td rowspan = 2>
-      <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224_quant.tgz">Mobilenet_1.0_224 (quant)</a>
-    </td>
-    <td>Pixel 2 </td>
-    <td>69.5 ms (0.9 ms)</td>
-  </tr>
-   <tr>
-     <td>Pixel xl </td>
-     <td>78.9 ms (2.2 ms)  </td>
-  </tr>
-  <tr>
-    <td rowspan = 2>
-      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/nasnet_mobile_2018_04_27.tgz">NASNet mobile</a>
-    </td>
-    <td>Pixel 2 </td>
-    <td>273.8 ms (3.5 ms)</td>
-  </tr>
-   <tr>
-     <td>Pixel xl </td>
-     <td>210.8 ms (4.2 ms)</td>
-  </tr>
-  <tr>
-    <td rowspan = 2>
-      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/squeezenet_2018_04_27.tgz">SqueezeNet</a>
-    </td>
-    <td>Pixel 2 </td>
-    <td>234.0 ms (2.1 ms)</td>
-  </tr>
-   <tr>
-     <td>Pixel xl </td>
-     <td>158.0 ms (2.1 ms)</td>
-  </tr>
-  <tr>
-    <td rowspan = 2>
-      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_resnet_v2_2018_04_27.tgz">Inception_ResNet_V2</a>
-    </td>
-    <td>Pixel 2 </td>
-    <td>2846.0 ms (15.0 ms)</td>
-  </tr>
-   <tr>
-     <td>Pixel xl </td>
-     <td>1973.0 ms (15.0 ms)  </td>
-  </tr>
-  <tr>
-    <td rowspan = 2>
-      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz">Inception_V4</a>
-    </td>
-    <td>Pixel 2 </td>
-    <td>3180.0 ms (11.7 ms)</td>
-  </tr>
-   <tr>
-     <td>Pixel xl </td>
-     <td>2262.0 ms (21.0 ms)  </td>
-  </tr>
-
- </table>
-
-# iOS benchmarks
-
-For running iOS benchmarks, the [benchmark
-app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/tools/benchmark/ios)
-was modified to include the appropriate model and `benchmark_params.json` was
-modified  to set `num_threads` to 1.
-
-<table>
-  <thead>
-    <tr>
-      <th>Model Name</th>
-      <th>Device </th>
-      <th>Mean inference time (std dev)</th>
-    </tr>
-  </thead>
-  <tr>
-    <td>
-      <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz">Mobilenet_1.0_224(float)</a>
-    </td>
-    <td>iPhone 8 </td>
-    <td>32.2 ms (0.8 ms)</td>
-  </tr>
-  <tr>
-    <td>
-      <a href="http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224_quant.tgz)">Mobilenet_1.0_224 (quant)</a>
-    </td>
-    <td>iPhone 8 </td>
-    <td>24.4 ms (0.8 ms)</td>
-  </tr>
-  <tr>
-    <td>
-      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/nasnet_mobile_2018_04_27.tgz">NASNet mobile</a>
-    </td>
-    <td>iPhone 8 </td>
-    <td>60.3 ms (0.6 ms)</td>
-  </tr>
-  <tr>
-    <td>
-      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/squeezenet_2018_04_27.tgz">SqueezeNet</a>
-    </td>
-    <td>iPhone 8 </td>
-    <td>44.3 (0.7 ms)</td>
-  </tr>
-  <tr>
-    <td>
-      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_resnet_v2_2018_04_27.tgz">Inception_ResNet_V2</a>
-    </td>
-    <td>iPhone 8</td>
-    <td>562.4 ms (18.2 ms)</td>
-  </tr>
-  <tr>
-    <td>
-      <a href="https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz">Inception_V4</a>
-    </td>
-    <td>iPhone 8 </td>
-    <td>661.0 ms (29.2 ms)</td>
-  </tr>
- </table>
diff --git a/tensorflow/contrib/lite/g3doc/custom_operators.md b/tensorflow/contrib/lite/g3doc/custom_operators.md
index 972e57f73e82961ebc5e341dd7a41bc00acc5d21..d979353bb3550fe53d86b2e6c76702a3970b01fe 100644
--- a/tensorflow/contrib/lite/g3doc/custom_operators.md
+++ b/tensorflow/contrib/lite/g3doc/custom_operators.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # How to use custom operators
 
 TensorFlow Lite currently supports a subset of TensorFlow operators. However, it
@@ -89,3 +92,83 @@ builtins.AddCustom("Sin", Register_SIN());
 
 Note that a similar process as above can be followed for supporting for a set of
 operations instead of a single operator.
+
+## Best Practices for writing custom operators
+
+1.  Optimize memory allocations and de-allocations cautiously. It is more
+    efficient to allocate memory in Prepare() instead of Invoke(), and allocate
+    memory before a loop instead of in every iteration. Use temporary tensors
+    data rather than mallocing yourself (see item 2). Use pointers/references
+    instead of copying as much as possible.
+
+2.  If a data structure will persist during the entire operation, we advise
+    pre-allocating the memory using temporary tensors. You may need to use
+    OpData struct to reference the tensor indices in other functions. See
+    example in the
+    [kernel for convolution](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/kernels/conv.cc).
+    A sample code snippet is below
+
+    ```
+    auto* op_data = reinterpret_cast<OpData*>(node->user_data);
+    TfLiteIntArrayFree(node->temporaries);
+    node->temporaries = TfLiteIntArrayCreate(1);
+    node->temporaries->data[0] = op_data->temp_tensor_index;
+    TfLiteTensor* temp_tensor = &context->tensors[op_data->temp_tensor_index];
+    temp_tensor->type =  kTfLiteFloat32;
+    temp_tensor->allocation_type = kTfLiteArenaRw;
+    ```
+
+3.  If it doesn't cost too much wasted memory, prefer using a static fixed size
+    array (or in Resize() pre-allocated std::vector) rather than using a
+    dynamically allocating std::vector every iteration of execution.
+
+4.  Avoid instantiating standard library container templates that don't already
+    exist, because they affect binary size. For example, if you need a std::map
+    in your operation that doesn't exist in other kernels, using a std::vector
+    with direct indexing mapping could work while keeping the binary size small.
+    See what other kernels use to gain insight (or ask).
+
+5.  Check the pointer to the memory returned by malloc. If this pointer is
+    nullptr, no operations should be performed using that pointer. If you
+    malloc() in a function and have an error exit, deallocate memory before you
+    exit.
+
+6.  Use TF_LITE_ENSURE(context, condition) to check for a specific condition.
+    Your code must not leave memory hanging when TF_LITE_ENSURE is done, i.e.,
+    these should be done before any resources are allocated that will leak.
+
+## Special TF Graph Attributes
+
+When Toco convertes a TF graph into TFLite format, it makes some assumption
+about custom operations that might be not correct. In this case, the generated
+graph can be not executable.
+
+It is possible to add aditional information about your custom op output to TF
+graph before it is converted. The following attributes are supported:
+
+-   **_output_quantized** a boolean attribute, true if the operation outputs are
+    quantized
+-   **_output_types** a list of types for output tensors
+-   **_output_shapes** a list of shapes for output tensors
+
+### Setting the Attributes
+
+This is an example how the attributes can be set:
+
+```python
+frozen_graph_def = tf.graph_util.convert_variables_to_constants(...)
+for node in frozen_graph_def.node:
+    if node.op == 'sin':
+      node.attr['_output_types'].list.type.extend([
+          types_pb2.DT_FLOAT,
+      ])
+      node.attr['_output_shapes'].list.shape.extend([
+          tf.TensorShape([10]),
+      ])
+      node.attr['_output_quantized'].b = False
+tflite_model = tf.contrib.lite.toco_convert(
+        frozen_graph_def,...)
+```
+
+**Note:** After the attributes are set, the graph can not be executed by
+Tensorflow, therefore it should be done just before the conversion.
diff --git a/tensorflow/docs_src/mobile/tflite/demo_android.md b/tensorflow/contrib/lite/g3doc/demo_android.md
similarity index 98%
rename from tensorflow/docs_src/mobile/tflite/demo_android.md
rename to tensorflow/contrib/lite/g3doc/demo_android.md
index fdf0bcf3c1135f0e702c7dda4d1d608a26169470..d79a2696b4e9cc10480aa67c7eaec5a356eff596 100644
--- a/tensorflow/docs_src/mobile/tflite/demo_android.md
+++ b/tensorflow/contrib/lite/g3doc/demo_android.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # Android Demo App
 
 An example Android application using TensorFLow Lite is available
diff --git a/tensorflow/docs_src/mobile/tflite/demo_ios.md b/tensorflow/contrib/lite/g3doc/demo_ios.md
similarity index 97%
rename from tensorflow/docs_src/mobile/tflite/demo_ios.md
rename to tensorflow/contrib/lite/g3doc/demo_ios.md
index 3be21da89f9e53d324c2ade0cb937f4b5b30fad4..a554898899e67a6bc2bc52733f5301767bc1c06a 100644
--- a/tensorflow/docs_src/mobile/tflite/demo_ios.md
+++ b/tensorflow/contrib/lite/g3doc/demo_ios.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # iOS Demo App
 
 The TensorFlow Lite demo is a camera app that continuously classifies whatever
diff --git a/tensorflow/docs_src/mobile/tflite/devguide.md b/tensorflow/contrib/lite/g3doc/devguide.md
similarity index 91%
rename from tensorflow/docs_src/mobile/tflite/devguide.md
rename to tensorflow/contrib/lite/g3doc/devguide.md
index b168d6c18366708ebaa7216481d262b02051168d..dc9cc98c0821edff57cb9428a50637a15211cfda 100644
--- a/tensorflow/docs_src/mobile/tflite/devguide.md
+++ b/tensorflow/contrib/lite/g3doc/devguide.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # Developer Guide
 
 Using a TensorFlow Lite model in your mobile app requires multiple
@@ -56,7 +59,7 @@ both floating point and quantized inference.
 A developer may choose to train a custom model using Tensorflow (see the
 [TensorFlow tutorials](../../tutorials/) for examples of building and training
 models). If you have already written a model, the first step is to export this
-to a @{tf.GraphDef} file. This is required because some formats do not store the
+to a `tf.GraphDef` file. This is required because some formats do not store the
 model structure outside the code, and we must communicate with other parts of the
 framework. See
 [Exporting the Inference Graph](https://github.com/tensorflow/models/blob/master/research/slim/README.md)
@@ -71,12 +74,12 @@ grow in future Tensorflow Lite releases.
 ## 2. Convert the model format
 
 The model generated (or downloaded) in the previous step is a *standard*
-Tensorflow model and you should now have a .pb or .pbtxt @{tf.GraphDef} file.
+Tensorflow model and you should now have a .pb or .pbtxt `tf.GraphDef` file.
 Models generated with transfer learning (re-training) or custom models must be
 converted—but, we must first freeze the graph to convert the model to the
 Tensorflow Lite format. This process uses several model formats:
 
-* @{tf.GraphDef} (.pb) —A protobuf that represents the TensorFlow training or
+* `tf.GraphDef` (.pb) —A protobuf that represents the TensorFlow training or
   computation graph. It contains operators, tensors, and variables definitions.
 * *CheckPoint* (.ckpt) —Serialized variables from a TensorFlow graph. Since this
   does not contain a graph structure, it cannot be interpreted by itself.
@@ -143,11 +146,11 @@ containing the model architecture. The [frozen_graph.pb](https://storage.googlea
 file used here is available for download. `output_file` is where the TensorFlow
 Lite model will get generated. The `input_type` and `inference_type`
 arguments should be set to `FLOAT`, unless converting a
-@{$performance/quantization$quantized model}. Setting the `input_array`,
-`output_array`, and `input_shape` arguments are not as straightforward. The
-easiest way to find these values is to explore the graph using Tensorboard. Reuse
-the arguments for specifying the output nodes for inference in the
-`freeze_graph` step.
+<a href="https://www.tensorflow.org/performance/quantization">quantized model</a>.
+Setting the `input_array`, `output_array`, and `input_shape` arguments are not as
+straightforward. The easiest way to find these values is to explore the graph
+using Tensorboard. Reuse the arguments for specifying the output nodes for
+inference in the `freeze_graph` step.
 
 It is also possible to use the Tensorflow Optimizing Converter with protobufs
 from either Python or from the command line (see the 
@@ -204,16 +207,16 @@ The open source Android demo app uses the JNI interface and is available
 [on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/java/demo/app).
 You can also download a
 [prebuilt APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk).
-See the @{$tflite/demo_android} guide for details.
+See the <a href="../demo_android.md">Android demo</a> guide for details.
 
-The @{$mobile/android_build} guide has instructions for installing TensorFlow on
-Android and setting up `bazel` and Android Studio.
+The <a href="./android_build.md">Android mobile</a> guide has instructions for
+installing TensorFlow on Android and setting up `bazel` and Android Studio.
 
 ### iOS
 
 To integrate a TensorFlow model in an iOS app, see the
 [TensorFlow Lite for iOS](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite/g3doc/ios.md)
-guide and @{$tflite/demo_ios} guide.
+guide and <a href="../demo_ios.md">iOS demo</a> guide.
 
 #### Core ML support
 
diff --git a/tensorflow/contrib/lite/g3doc/ios.md b/tensorflow/contrib/lite/g3doc/ios.md
index e0358a444d6dffc377bf13ee72ba5477359d6e07..d78d373ccfea074872773693c562253b202a646b 100644
--- a/tensorflow/contrib/lite/g3doc/ios.md
+++ b/tensorflow/contrib/lite/g3doc/ios.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # TensorFlow Lite for iOS
 
 ## Building
diff --git a/tensorflow/contrib/lite/g3doc/models.md b/tensorflow/contrib/lite/g3doc/models.md
index 4e7d33a1b63d5a84e7ad2d0505ec8b9a4a887baa..3292aece0e76244a61613b514457edf479858fdb 100644
--- a/tensorflow/contrib/lite/g3doc/models.md
+++ b/tensorflow/contrib/lite/g3doc/models.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # List of Hosted Models
 
 ## Image classification (Float Models)
diff --git a/tensorflow/contrib/lite/g3doc/ops_versioning.md b/tensorflow/contrib/lite/g3doc/ops_versioning.md
index bd2f797e6c5b05f52bec9fc34f1b8011aca70330..b06f4fd3b893e5e5977f92de26109a6dd264531f 100644
--- a/tensorflow/contrib/lite/g3doc/ops_versioning.md
+++ b/tensorflow/contrib/lite/g3doc/ops_versioning.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # TensorFlow Lite Ops Versioning
 
 This document describes TensorFlow Lite's op versioning schema. Op
diff --git a/tensorflow/docs_src/mobile/tflite/index.md b/tensorflow/contrib/lite/g3doc/overview.md
similarity index 99%
rename from tensorflow/docs_src/mobile/tflite/index.md
rename to tensorflow/contrib/lite/g3doc/overview.md
index cc4af2a87514fc3b0303347bdab6f30cbc22ad55..be60d7941ade824ee201bfd05400fb3e4e9fae7e 100644
--- a/tensorflow/docs_src/mobile/tflite/index.md
+++ b/tensorflow/contrib/lite/g3doc/overview.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # Introduction to TensorFlow Lite
 
 TensorFlow Lite is TensorFlow’s lightweight solution for mobile and embedded
diff --git a/tensorflow/docs_src/mobile/tflite/performance.md b/tensorflow/contrib/lite/g3doc/performance.md
similarity index 98%
rename from tensorflow/docs_src/mobile/tflite/performance.md
rename to tensorflow/contrib/lite/g3doc/performance.md
index 79bacaaa1b889a8711e5c09c7fd4e4912e70d3bd..613e9f97c38942f20d3ca44cdc69e72b35c8608f 100644
--- a/tensorflow/docs_src/mobile/tflite/performance.md
+++ b/tensorflow/contrib/lite/g3doc/performance.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # Performance
 
 This document lists TensorFlow Lite performance benchmarks when running well
diff --git a/tensorflow/contrib/lite/g3doc/rpi.md b/tensorflow/contrib/lite/g3doc/rpi.md
index ab50789307414255bccd84d4cfcb6ddecc25ba08..cdc9172d873bfd32811ca69901ed2e4eedf902a3 100644
--- a/tensorflow/contrib/lite/g3doc/rpi.md
+++ b/tensorflow/contrib/lite/g3doc/rpi.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # TensorFlow Lite for Raspberry Pi
 
 ## Cross compiling
diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
index 49d00a66ba7d7e37a672de250a98f877e0cae75f..2ea7aeaa5d93b7cee260eeb72ccf9688b8d2c4c4 100644
--- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # TensorFlow Lite & TensorFlow Compatibility Guide
 
 TensorFlow Lite supports a number of TensorFlow operations used in common
@@ -59,6 +62,7 @@ counterparts:
 *   [tf.nn.softmax](https://www.tensorflow.org/api_docs/python/tf/nn/softmax) -
     *as long as tensors are 2D and axis is the last dimension*
 *   [tf.nn.top_k](https://www.tensorflow.org/api_docs/python/tf/nn/top_k)
+*   [tf.one_hot](https://www.tensorflow.org/api_docs/python/tf/one_hot)
 *   [tf.pad](https://www.tensorflow.org/api_docs/python/tf/pad) - *as long as
     mode and constant_values are not used*
 *   [tf.reduce_mean](https://www.tensorflow.org/api_docs/python/tf/reduce_mean) -
@@ -815,6 +819,18 @@ Outputs {
 }
 ```
 
+**PACK**
+
+```
+Inputs {
+  0: a list of tensors.
+  1: an integer.
+}
+Outputs {
+  0: A tensor of stacked tensors.
+}
+```
+
 And these are TensorFlow Lite operations that are present but not ready for
 custom models yet:
 
diff --git a/tensorflow/docs_src/mobile/android_build.md b/tensorflow/contrib/lite/g3doc/tfmobile/android_build.md
similarity index 97%
rename from tensorflow/docs_src/mobile/android_build.md
rename to tensorflow/contrib/lite/g3doc/tfmobile/android_build.md
index f4b07db4591dddcfbf3633f471072f4a0eea9843..76e16fc9db27782fe0f9454ba463722f4bf6eb4b 100644
--- a/tensorflow/docs_src/mobile/android_build.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/android_build.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # Building TensorFlow on Android
 
 To get you started working with TensorFlow on Android, we'll walk through two
@@ -91,7 +94,8 @@ using [ADB](https://developer.android.com/studio/command-line/adb.html). This
 requires some knowledge of build systems and Android developer tools, but we'll
 guide you through the basics here.
 
-- First, follow our instructions for @{$install/install_sources$installing from sources}.
+- First, follow our instructions for
+  <a href="http://www.tensorflow.org/install/install_sources">installing from sources</a>.
   This will also guide you through installing Bazel and cloning the
   TensorFlow code.
 
diff --git a/tensorflow/docs_src/mobile/mobile_intro.md b/tensorflow/contrib/lite/g3doc/tfmobile/index.md
similarity index 86%
rename from tensorflow/docs_src/mobile/mobile_intro.md
rename to tensorflow/contrib/lite/g3doc/tfmobile/index.md
index baad4433083d18a19ea3dd5ec0c1bae498ac2da9..bd047bfceceddfd0b5a9fd0c83cb47a339299abf 100644
--- a/tensorflow/docs_src/mobile/mobile_intro.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/index.md
@@ -1,4 +1,45 @@
-# Introduction to TensorFlow Mobile
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
+# Overview
+
+TensorFlow was designed to be a good deep learning solution for mobile
+platforms. Currently we have two solutions for deploying machine learning
+applications on mobile and embedded devices: TensorFlow for Mobile and
+<a href="../index.md">TensorFlow Lite</a>.
+
+## TensorFlow Lite versus TensorFlow Mobile
+
+Here are a few of the differences between the two:
+
+- TensorFlow Lite is an evolution of TensorFlow Mobile.  In most cases, apps
+  developed with TensorFlow Lite will have a smaller binary size, fewer
+  dependencies, and better performance.
+
+- TensorFlow Lite is in developer preview, so not all use cases are covered yet.
+  We expect you to use TensorFlow Mobile to cover production cases.
+
+- TensorFlow Lite supports only a limited set of operators, so not all models
+  will work on it by default. TensorFlow for Mobile has a fuller set of
+  supported functionality.
+
+TensorFlow Lite provides better performance and a small binary size on mobile
+platforms as well as the ability to leverage hardware acceleration if available
+on their platforms. In addition, it has many fewer dependencies so it can be
+built and hosted on simpler, more constrained device scenarios. TensorFlow Lite
+also allows targeting accelerators through the [Neural Networks
+API](https://developer.android.com/ndk/guides/neuralnetworks/index.html).
+
+TensorFlow Lite currently has coverage for a limited set of operators. While
+TensorFlow for Mobile supports only a constrained set of ops by default, in
+principle if you use an arbitrary operator in TensorFlow, it can be customized
+to build that kernel. Thus use cases which are not currently supported by
+TensorFlow Lite should continue to use TensorFlow for Mobile. As TensorFlow Lite
+evolves, it will gain additional operators, and the decision will be easier to
+make.
+
+
+## Introduction to TensorFlow Mobile
 
 TensorFlow was designed from the ground up to be a good deep learning solution
 for mobile platforms like Android and iOS. This mobile guide should help you
@@ -167,7 +208,7 @@ interesting products possible.
 
 TensorFlow runs on Ubuntu Linux, Windows 10, and OS X. For a list of all
 supported operating systems and instructions to install TensorFlow, see
-@{$install$Installing Tensorflow}.
+<a href="https://www.tensorflow.org/install">Installing Tensorflow</a>.
 
 Note that some of the sample code we provide for mobile TensorFlow requires you
 to compile TensorFlow from source, so you’ll need more than just `pip install`
@@ -241,8 +282,3 @@ results you’ll see. It’s common for an algorithm to get great training accur
 numbers but then fail to be useful within a real application because there’s a
 mismatch between the dataset and real usage. Prototype end-to-end usage as soon
 as possible to create a consistent user experience.
-
-## Next Steps
-
-We suggest you get started by building one of our demos for
-@{$mobile/android_build$Android} or @{$mobile/ios_build$iOS}.
diff --git a/tensorflow/docs_src/mobile/ios_build.md b/tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md
similarity index 98%
rename from tensorflow/docs_src/mobile/ios_build.md
rename to tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md
index 4c84a1214a26eeb90c1b6a186a369212377b06cd..6223707892ce7b288ecabf932b33cd39860446a6 100644
--- a/tensorflow/docs_src/mobile/ios_build.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/ios_build.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # Building TensorFlow on iOS
 
 ## Using CocoaPods
diff --git a/tensorflow/docs_src/mobile/linking_libs.md b/tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md
similarity index 83%
rename from tensorflow/docs_src/mobile/linking_libs.md
rename to tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md
index efef5dd0daa0b267d8384d32d62d9ce0226dc102..4c2071ed053125cfa643ed785fe302198f734ead 100644
--- a/tensorflow/docs_src/mobile/linking_libs.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/linking_libs.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # Integrating TensorFlow libraries
 
 Once you have made some progress on a model that addresses the problem you’re
@@ -14,11 +17,11 @@ TensorFlow mobile demo apps.
 
 After you've managed to build the examples, you'll probably want to call
 TensorFlow from one of your existing applications. The very easiest way to do
-this is to use the Pod installation steps described
-@{$mobile/ios_build#using_cocoapods$here}, but if you want to build TensorFlow
-from source (for example to customize which operators are included) you'll need
-to break out TensorFlow as a framework, include the right header files, and link
-against the built libraries and dependencies.
+this is to use the Pod installation steps described in
+<a href="./ios_build.md">Building TensorFlow on iOS</a>, but if you want to build
+TensorFlow from source (for example to customize which operators are included)
+you'll need to break out TensorFlow as a framework, include the right header
+files, and link against the built libraries and dependencies.
 
 ### Android
 
@@ -82,10 +85,12 @@ recompile of the core.
 To achieve this capability, TensorFlow uses a registration pattern in a lot of
 places. In the code, it looks like this:
 
-    class MulKernel : OpKernel {
-      Status Compute(OpKernelContext* context) { … }
-    };
-    REGISTER_KERNEL(MulKernel, “Mul”);
+```
+class MulKernel : OpKernel {
+	Status Compute(OpKernelContext* context) { … }
+};
+REGISTER_KERNEL(MulKernel, “Mul”);
+```
 
 This would be in a standalone `.cc` file linked into your application, either
 as part of the main set of kernels or as a separate custom library. The magic
@@ -101,15 +106,17 @@ doesn’t offer a good mechanism for doing this sort of registration, so we have
 to resort to some tricky code. Under the hood, the macro is implemented so that
 it produces something like this:
 
-    class RegisterMul {
-     public:
-      RegisterMul() {
-        global_kernel_registry()->Register(“Mul”, [](){
-          return new MulKernel()
-        });
-      }
-    };
-    RegisterMul g_register_mul;
+```
+class RegisterMul {
+	public:
+		RegisterMul() {
+			global_kernel_registry()->Register(“Mul”, [](){
+				return new MulKernel()
+			});
+	}
+};
+RegisterMul g_register_mul;
+```
 
 This sets up a class `RegisterMul` with a constructor that tells the global
 kernel registry what function to call when somebody asks it how to create a
@@ -176,8 +183,10 @@ have an experimental script at [rename_protobuf.sh](https://github.com/tensorflo
 You need to run this as part of the makefile build, after you’ve downloaded all
 the dependencies:
 
-    tensorflow/contrib/makefile/download_dependencies.sh
-    tensorflow/contrib/makefile/rename_protobuf.sh
+```
+tensorflow/contrib/makefile/download_dependencies.sh
+tensorflow/contrib/makefile/rename_protobuf.sh
+```
 
 ## Calling the TensorFlow API
 
@@ -193,18 +202,20 @@ use case, while on iOS and Raspberry Pi you call directly into the C++ API.
 
 Here’s what a typical Inference Library sequence looks like on Android:
 
-    // Load the model from disk.
-    TensorFlowInferenceInterface inferenceInterface =
-    new TensorFlowInferenceInterface(assetManager, modelFilename);
+```
+// Load the model from disk.
+TensorFlowInferenceInterface inferenceInterface =
+new TensorFlowInferenceInterface(assetManager, modelFilename);
 
-    // Copy the input data into TensorFlow.
-    inferenceInterface.feed(inputName, floatValues, 1, inputSize, inputSize, 3);
+// Copy the input data into TensorFlow.
+inferenceInterface.feed(inputName, floatValues, 1, inputSize, inputSize, 3);
 
-    // Run the inference call.
-    inferenceInterface.run(outputNames, logStats);
+// Run the inference call.
+inferenceInterface.run(outputNames, logStats);
 
-    // Copy the output Tensor back into the output array.
-    inferenceInterface.fetch(outputName, outputs);
+// Copy the output Tensor back into the output array.
+inferenceInterface.fetch(outputName, outputs);
+```
 
 You can find the source of this code in the [Android examples](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/android/src/org/tensorflow/demo/TensorFlowImageClassifier.java#L107).
 
@@ -212,27 +223,29 @@ You can find the source of this code in the [Android examples](https://github.co
 
 Here’s the equivalent code for iOS and Raspberry Pi:
 
-    // Load the model.
-    PortableReadFileToProto(file_path, &tensorflow_graph);
-
-    // Create a session from the model.
-    tensorflow::Status s = session->Create(tensorflow_graph);
-    if (!s.ok()) {
-      LOG(FATAL) << "Could not create TensorFlow Graph: " << s;
-    }
-
-    // Run the model.
-    std::string input_layer = "input";
-    std::string output_layer = "output";
-    std::vector<tensorflow::Tensor> outputs;
-    tensorflow::Status run_status = session->Run({{input_layer, image_tensor}},
+```
+// Load the model.
+PortableReadFileToProto(file_path, &tensorflow_graph);
+
+// Create a session from the model.
+tensorflow::Status s = session->Create(tensorflow_graph);
+if (!s.ok()) {
+    LOG(FATAL) << "Could not create TensorFlow Graph: " << s;
+}
+
+// Run the model.
+std::string input_layer = "input";
+std::string output_layer = "output";
+std::vector<tensorflow::Tensor> outputs;
+tensorflow::Status run_status = session->Run({\{input_layer, image_tensor}},
                                {output_layer}, {}, &outputs);
-    if (!run_status.ok()) {
-      LOG(FATAL) << "Running model failed: " << run_status;
-    }
+if (!run_status.ok()) {
+    LOG(FATAL) << "Running model failed: " << run_status;
+}
 
-    // Access the output data.
-    tensorflow::Tensor* output = &outputs[0];
+// Access the output data.
+tensorflow::Tensor* output = &outputs[0];
+```
 
 This is all based on the
 [iOS sample code](https://www.tensorflow.org/code/tensorflow/examples/ios/simple/RunModelViewController.mm),
diff --git a/tensorflow/docs_src/mobile/optimizing.md b/tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md
similarity index 98%
rename from tensorflow/docs_src/mobile/optimizing.md
rename to tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md
index 778e4d3a6233c3bec70b830bc998013745a1f0ba..a0192c3541483437b817e22eb92193bd7bcb4c28 100644
--- a/tensorflow/docs_src/mobile/optimizing.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/optimizing.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # Optimizing for mobile
 
 There are some special issues that you have to deal with when you’re trying to
@@ -77,7 +80,7 @@ out of a mobile device's memory faster.
 
 To understand how large your network will be on disk, start by looking at the
 size on disk of your `GraphDef` file after you’ve run `freeze_graph` and
-`strip_unused_nodes` on it (see @{$mobile/prepare_models$Preparing models} for
+`strip_unused_nodes` on it (see <a href="./prepare_models.md">Preparing models</a> for
 more details on these tools), since then it should only contain
 inference-related nodes. To double-check that your results are as expected, run
 the `summarize_graph` tool to see how many parameters are in constants:
@@ -103,7 +106,8 @@ you multiply the number of const parameters by four, you should get something
 that’s close to the size of the file on disk. You can often get away with only
 eight-bits per parameter with very little loss of accuracy in the final result,
 so if your file size is too large you can try using
-@{$performance/quantization$quantize_weights} to transform the parameters down.
+<a href="https://www.tensorflow.org/performance/quantization">quantize_weights</a>
+to transform the parameters down.
 
     bazel build tensorflow/tools/graph_transforms:transform_graph && \
     bazel-bin/tensorflow/tools/graph_transforms/transform_graph \
@@ -292,7 +296,8 @@ run it on a 64-bit ARM device:
 
 You can interpret the results in exactly the same way as the desktop version
 above. If you have any trouble figuring out what the right input and output
-names and types are, take a look at the @{$mobile/prepare_models$Preparing models}
+names and types are, take a look at the
+<a href="./prepare_models">Preparing models</a>
 page for details about detecting these for your model, and look at the
 `summarize_graph` tool which may give you
 helpful information.
diff --git a/tensorflow/docs_src/mobile/prepare_models.md b/tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md
similarity index 98%
rename from tensorflow/docs_src/mobile/prepare_models.md
rename to tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md
index 2b84dbb97388b16c6a4ae1d3472e0b1a993285f0..6b4e4a92bd9262139be3cf650b7d16714ee3a277 100644
--- a/tensorflow/docs_src/mobile/prepare_models.md
+++ b/tensorflow/contrib/lite/g3doc/tfmobile/prepare_models.md
@@ -1,3 +1,6 @@
+book_path: /mobile/_book.yaml
+project_path: /mobile/_project.yaml
+
 # Preparing models for mobile deployment
 
 The requirements for storing model information during training are very
@@ -255,8 +258,8 @@ The criteria for including ops and types fall into several categories:
 These ops are trimmed by default to optimize for inference on mobile, but it is
 possible to alter some build files to change the default.  After alternating the
 build files, you will need to recompile TensorFlow.  See below for more details
-on how to do this, and also see @{$mobile/optimizing#binary_size$Optimizing} for
-more on reducing your binary size.
+on how to do this, and also see <a href="./optimizing.md">optimizing binary size</a>
+for more on reducing your binary size.
 
 ### Locate the implementation
 
diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc
index 26fecceab028c7fb86edc3f5e0a6d9d1e5556ffd..e38597495dc7e860209026631c2d386f690b6461 100644
--- a/tensorflow/contrib/lite/interpreter.cc
+++ b/tensorflow/contrib/lite/interpreter.cc
@@ -43,10 +43,13 @@ namespace {
 TfLiteStatus ReportOpError(TfLiteContext* context, const TfLiteNode& node,
                            const TfLiteRegistration& registration,
                            int node_index, const char* message) {
-  context->ReportError(context, "Node number %d (%s) %s.\n", node_index,
-                       EnumNameBuiltinOperator(static_cast<BuiltinOperator>(
-                           registration.builtin_code)),
-                       message);
+  context->ReportError(
+      context, "Node number %d (%s) %s.\n", node_index,
+      registration.custom_name
+          ? registration.custom_name
+          : EnumNameBuiltinOperator(
+                static_cast<BuiltinOperator>(registration.builtin_code)),
+      message);
   return kTfLiteError;
 }
 
@@ -131,9 +134,7 @@ Interpreter::Interpreter(ErrorReporter* error_reporter)
   context_.SetExternalContext = SetExternalContext;
 
   // Invalid to call these these except from TfLiteDelegate
-  SetForbiddenContextFunction(&context_.GetNodeAndRegistration);
-  SetForbiddenContextFunction(&context_.ReplaceSubgraphsWithDelegateKernels);
-  SetForbiddenContextFunction(&context_.GetExecutionPlan);
+  SwitchToKernelContext();
 
   // Reserve some space for the tensors to avoid excessive resizing.
   tensors_.reserve(kTensorsReservedCapacity);
@@ -278,8 +279,9 @@ TfLiteStatus Interpreter::ReplaceSubgraphsWithDelegateKernels(
         int node_index;
 
         TfLiteDelegateParams* params = CreateDelegateParams(delegate, subgraph);
-        AddNodeWithParameters(subgraph.input_tensors, subgraph.output_tensors,
-                              nullptr, 0, params, &registration, &node_index);
+        TF_LITE_ENSURE_STATUS(AddNodeWithParameters(
+            subgraph.input_tensors, subgraph.output_tensors, nullptr, 0, params,
+            &registration, &node_index));
 
         // Initialize the output tensors's delegate-related fields.
         for (int tensor_index : subgraph.output_tensors) {
@@ -924,6 +926,19 @@ void Interpreter::SetNumThreads(int num_threads) {
   }
 }
 
+void Interpreter::SwitchToDelegateContext() {
+  context_.GetNodeAndRegistration = GetNodeAndRegistration;
+  context_.ReplaceSubgraphsWithDelegateKernels =
+      ReplaceSubgraphsWithDelegateKernels;
+  context_.GetExecutionPlan = GetExecutionPlan;
+}
+
+void Interpreter::SwitchToKernelContext() {
+  SetForbiddenContextFunction(&context_.GetNodeAndRegistration);
+  SetForbiddenContextFunction(&context_.ReplaceSubgraphsWithDelegateKernels);
+  SetForbiddenContextFunction(&context_.GetExecutionPlan);
+}
+
 TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate,
                                                   bool allow_dynamic_tensors) {
   if (!allow_dynamic_tensors) {
@@ -950,17 +965,12 @@ TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate,
 
   // TODO(aselle): Consider if it is worth storing pointers to delegates.
   // Setup additional context interface.
-  context_.GetNodeAndRegistration = GetNodeAndRegistration;
-  context_.ReplaceSubgraphsWithDelegateKernels =
-      ReplaceSubgraphsWithDelegateKernels;
-  context_.GetExecutionPlan = GetExecutionPlan;
+  SwitchToDelegateContext();
 
   TfLiteStatus status = delegate->Prepare(&context_, delegate);
 
   // Remove additional context info.
-  SetForbiddenContextFunction(&context_.GetNodeAndRegistration);
-  SetForbiddenContextFunction(&context_.ReplaceSubgraphsWithDelegateKernels);
-  SetForbiddenContextFunction(&context_.GetExecutionPlan);
+  SwitchToKernelContext();
 
   TF_LITE_ENSURE_OK(&context_, status);
 
diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h
index bc608e2fcecfd0fa6f03841542f06911e3f88f29..be149a8cc0e642d10b270ba617cd8d6be29430b2 100644
--- a/tensorflow/contrib/lite/interpreter.h
+++ b/tensorflow/contrib/lite/interpreter.h
@@ -111,7 +111,7 @@ class Interpreter {
   // processing this model will be forwarded to the error_reporter object.
   //
   // Note, if error_reporter is nullptr, then a default StderrReporter is
-  // used.
+  // used. Ownership of 'error_reporter' remains with the caller.
   explicit Interpreter(ErrorReporter* error_reporter = DefaultErrorReporter());
 
   ~Interpreter();
@@ -416,6 +416,13 @@ class Interpreter {
  private:
   friend class InterpreterTest;
 
+  // Prevent 'context_' from accessing functions that are only available to
+  // delegated kernels.
+  void SwitchToKernelContext();
+
+  // Add delegate-only functions to 'context_'.
+  void SwitchToDelegateContext();
+
   // Give 'op_reg' a chance to initialize itself using the contents of
   // 'buffer'.
   void* OpInit(const TfLiteRegistration& op_reg, const char* buffer,
@@ -502,6 +509,7 @@ class Interpreter {
   // Update the execution graph to replace some of the nodes with stub
   // nodes. Specifically any node index that has `nodes[index]==1` will be
   // slated for replacement with a delegate kernel specified by registration.
+  // Ownership of 'nodes_to_replace' and 'delegate' remains with the caller.
   // WARNING: This is an experimental interface that is subject to change.
   TfLiteStatus ReplaceSubgraphsWithDelegateKernels(
       TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace,
diff --git a/tensorflow/contrib/lite/interpreter_test.cc b/tensorflow/contrib/lite/interpreter_test.cc
index 10119903fed448bd44f408efb495831216dc594c..2bf598bad71b87afaa22c1eb95474c49386c122f 100644
--- a/tensorflow/contrib/lite/interpreter_test.cc
+++ b/tensorflow/contrib/lite/interpreter_test.cc
@@ -647,18 +647,6 @@ TEST(BasicInterpreter, AllocateTwice) {
   ASSERT_EQ(old_tensor1_ptr, interpreter.tensor(1)->data.raw);
 }
 
-struct TestErrorReporter : public ErrorReporter {
-  int Report(const char* format, va_list args) override {
-    char buffer[1024];
-    int size = vsnprintf(buffer, sizeof(buffer), format, args);
-    all_reports += buffer;
-    calls++;
-    return size;
-  }
-  int calls = 0;
-  std::string all_reports;
-};
-
 TEST(BasicInterpreter, TestNullErrorReporter) {
   TestErrorReporter reporter;
   Interpreter interpreter;
@@ -668,8 +656,9 @@ TEST(BasicInterpreter, TestCustomErrorReporter) {
   TestErrorReporter reporter;
   Interpreter interpreter(&reporter);
   ASSERT_NE(interpreter.Invoke(), kTfLiteOk);
-  ASSERT_EQ(reporter.all_reports, "Invoke called on model that is not ready.");
-  ASSERT_EQ(reporter.calls, 1);
+  ASSERT_EQ(reporter.error_messages(),
+            "Invoke called on model that is not ready.");
+  ASSERT_EQ(reporter.num_calls(), 1);
 }
 
 TEST(BasicInterpreter, TestUnsupportedDelegateFunctions) {
diff --git a/tensorflow/contrib/lite/java/AndroidManifest.xml b/tensorflow/contrib/lite/java/AndroidManifest.xml
index f705feacbec38ab5152ce52b701320d8f1cd8d3d..b91c6d149a213926be90b9b131bd632d4f79a0fc 100644
--- a/tensorflow/contrib/lite/java/AndroidManifest.xml
+++ b/tensorflow/contrib/lite/java/AndroidManifest.xml
@@ -1,7 +1,12 @@
 <?xml version="1.0" encoding="utf-8"?>
 <manifest xmlns:android="http://schemas.android.com/apk/res/android"
-          package="org.tensorflow.lite">
-    <application>
-    </application>
+    package="org.tensorflow.lite">
+
+    <uses-sdk
+        android:minSdkVersion="4"
+        android:targetSdkVersion="19" />
+
+    <application />
+
 </manifest>
 
diff --git a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index e2c1edd9afbac9ca75262e1e41f7cf6334564dec..fdcf00a0a08459d8d669f1def3ae2eb21dbd31c3 100644
--- a/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -152,10 +152,11 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_allocateTensors(
   if (error_reporter == nullptr) return;
 
   if (interpreter->AllocateTensors() != kTfLiteOk) {
-    throwException(env, kNullPointerException,
-                   "Internal error: Cannot allocate memory for the interpreter:"
-                   " %s",
-                   error_reporter->CachedErrorMessage());
+    throwException(
+        env, kIllegalStateException,
+        "Internal error: Unexpected failure when preparing tensor allocations:"
+        " %s",
+        error_reporter->CachedErrorMessage());
   }
 }
 
@@ -336,10 +337,11 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_createInterpreter(
   // allocates memory
   status = interpreter->AllocateTensors();
   if (status != kTfLiteOk) {
-    throwException(env, kNullPointerException,
-                   "Internal error: Cannot allocate memory for the interpreter:"
-                   " %s",
-                   error_reporter->CachedErrorMessage());
+    throwException(
+        env, kIllegalStateException,
+        "Internal error: Unexpected failure when preparing tensor allocations:"
+        " %s",
+        error_reporter->CachedErrorMessage());
     return 0;
   }
   return reinterpret_cast<jlong>(interpreter.release());
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index ad30624f400d07c7622316e4c01a75d67340ddc5..026ab4de03af26a1cdbde64f79c11a29fa5483f3 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -58,6 +58,7 @@ cc_library(
     }),
     deps = [
         ":op_macros",
+        "//tensorflow/contrib/lite:arena_planner",
         "//tensorflow/contrib/lite:context",
         "//tensorflow/contrib/lite/kernels/internal:optimized",
     ],
@@ -175,6 +176,8 @@ cc_library(
         "mfcc.cc",
         "mul.cc",
         "neg.cc",
+        "one_hot.cc",
+        "pack.cc",
         "pad.cc",
         "pooling.cc",
         "pow.cc",
@@ -1155,6 +1158,33 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "pack_test",
+    size = "small",
+    srcs = ["pack_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "one_hot_test",
+    size = "small",
+    srcs = ["one_hot_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/lite/kernels/activations.cc b/tensorflow/contrib/lite/kernels/activations.cc
index 99f81c4a8a78ab0b2a24955d77f25ed09da13b84..6e13b8c667c5c5188c9e1bc753346f231ae8e1b0 100644
--- a/tensorflow/contrib/lite/kernels/activations.cc
+++ b/tensorflow/contrib/lite/kernels/activations.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <unistd.h>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
@@ -186,8 +185,8 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) {
   TfLiteTensor* output = GetOutput(context, node, 0);
   TF_LITE_ENSURE_EQ(context, input->type, output->type);
 
-  TF_LITE_ENSURE(context,
-                 NumDimensions(input) == 2 || NumDimensions(input) == 4);
+  const int num_dims = NumDimensions(input);
+  TF_LITE_ENSURE(context, num_dims == 1 || num_dims == 2 || num_dims == 4);
 
   if (input->type == kTfLiteUInt8) {
     TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
@@ -365,13 +364,9 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
-// Takes a 2D tensor and perform softmax along the second dimension.
-void Softmax2DFloat(const TfLiteTensor* input, TfLiteTensor* output,
-                    TfLiteSoftmaxParams* params) {
-  const int batch_size = input->dims->data[0];
-  const int input_size = input->dims->data[1];
-  float* in = input->data.f;
-  float* out = output->data.f;
+// Performs softmax along the input of size (input_size * batch_size).
+void Softmax(const float* in, const int input_size, const int batch_size,
+             const float beta, float* out) {
   TF_LITE_ASSERT(input_size > 0);
 
   // For each batch
@@ -385,7 +380,7 @@ void Softmax2DFloat(const TfLiteTensor* input, TfLiteTensor* output,
     // Compute the normalized sum of exps.
     float exp_sum = 0.0;
     for (int i = 0; i < input_size; i++) {
-      out[i] = std::exp((in[i] - max_coeff) * params->beta);
+      out[i] = std::exp((in[i] - max_coeff) * beta);
       exp_sum += out[i];
     }
 
@@ -401,6 +396,33 @@ void Softmax2DFloat(const TfLiteTensor* input, TfLiteTensor* output,
   }
 }
 
+// Takes a 1D tensor and performs softmax along it.
+void Softmax1DFloat(const TfLiteTensor* input, TfLiteTensor* output,
+                    TfLiteSoftmaxParams* params) {
+  const int input_size = input->dims->data[0];
+  Softmax(input->data.f, input_size, 1, params->beta, output->data.f);
+}
+
+// Takes a 2D tensor and perform softmax along the last dimension.
+void Softmax2DFloat(const TfLiteTensor* input, TfLiteTensor* output,
+                    TfLiteSoftmaxParams* params) {
+  const int batch_size = input->dims->data[0];
+  const int input_size = input->dims->data[1];
+  Softmax(input->data.f, input_size, batch_size, params->beta, output->data.f);
+}
+
+void Softmax1DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
+                        TfLiteSoftmaxParams* params, OpData* data) {
+  // TODO(ahentz): this is arguably a dirty trick. Since the implementation
+  // always traverses the last dimension of a 4D tensor, we will pretend our 1D
+  // tensor is 4D in a special way. We will convert a (Y) shape into a (1,
+  // 1, 1, Y) shape.
+  const int input_size = input->dims->data[0];
+  optimized_ops::Softmax(
+      GetTensorData<uint8_t>(input), GetTensorShape({1, 1, 1, input_size}),
+      data->input_multiplier, data->input_left_shift, data->diff_min,
+      GetTensorData<uint8_t>(output), GetTensorShape({1, 1, 1, input_size}));
+}
 void Softmax2DQuantized(const TfLiteTensor* input, TfLiteTensor* output,
                         TfLiteSoftmaxParams* params, OpData* data) {
   // TODO(ahentz): this is arguably a dirty trick. Since the implementation
@@ -443,6 +465,10 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
   // dimensions.
   switch (input->type) {
     case kTfLiteFloat32: {
+      if (NumDimensions(input) == 1) {
+        Softmax1DFloat(input, output, params);
+        return kTfLiteOk;
+      }
       if (NumDimensions(input) == 2) {
         Softmax2DFloat(input, output, params);
         return kTfLiteOk;
@@ -452,11 +478,15 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
         return kTfLiteOk;
       }
       context->ReportError(
-          context, "Only 2D and 4D tensors supported currently, got %dD.",
+          context, "Only 1D, 2D and 4D tensors supported currently, got %dD.",
           NumDimensions(input));
       return kTfLiteError;
     }
     case kTfLiteUInt8: {
+      if (NumDimensions(input) == 1) {
+        Softmax1DQuantized(input, output, params, data);
+        return kTfLiteOk;
+      }
       if (NumDimensions(input) == 2) {
         Softmax2DQuantized(input, output, params, data);
         return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/kernels/activations_test.cc b/tensorflow/contrib/lite/kernels/activations_test.cc
index 587e1303da6afed1fc711100f457f1bf62b0b7e1..083cdf78d76991b89c4c2caf03dcb6db404a2578 100644
--- a/tensorflow/contrib/lite/kernels/activations_test.cc
+++ b/tensorflow/contrib/lite/kernels/activations_test.cc
@@ -339,6 +339,29 @@ TEST(QuantizedActivationsOpTest, Softmax4D) {
                   kQuantizedTolerance)));
 }
 
+TEST(FloatActivationsOpTest, Softmax1D) {
+  FloatActivationsOpModel m(0.1,
+                            /*input=*/{TensorType_FLOAT32, {8}});
+  m.SetInput({0, -6, 2, 4, 3, -2, 10, 1});
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray(ArrayFloatNear(
+          {.09752, .05352, .11911, .14548, .13164, .07984, .26509, .10778})));
+}
+
+TEST(QuantizedActivationsOpTest, Softmax1D) {
+  QuantizedActivationsOpModel m(0.1,
+                                /*input=*/{TensorType_UINT8, {8}, -10, 10});
+  m.SetInput<uint8_t>({0, -6, 2, 4, 3, -2, 10, 1});
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetDequantizedOutput<uint8_t>(),
+      ElementsAreArray(ArrayFloatNear({0.09766, 0.05469, 0.12109, 0.14453,
+                                       0.13281, 0.07813, 0.26563, 0.10938},
+                                      kQuantizedTolerance)));
+}
+
 TEST(FloatActivationsOpTest, Softmax2D) {
   FloatActivationsOpModel m(0.1,
                             /*input=*/{TensorType_FLOAT32, {2, 4}});
diff --git a/tensorflow/contrib/lite/kernels/add.cc b/tensorflow/contrib/lite/kernels/add.cc
index f44d531cbfa9ed41f881380752558555aab97b4d..af9b5c7013afc5d32d01cba07492a282727b3e12 100644
--- a/tensorflow/contrib/lite/kernels/add.cc
+++ b/tensorflow/contrib/lite/kernels/add.cc
@@ -110,15 +110,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
     QuantizeMultiplierSmallerThanOneExp(
         real_input1_multiplier, &data->input1_multiplier, &data->input1_shift);
-    data->input1_shift *= -1;
 
     QuantizeMultiplierSmallerThanOneExp(
         real_input2_multiplier, &data->input2_multiplier, &data->input2_shift);
-    data->input2_shift *= -1;
 
     QuantizeMultiplierSmallerThanOneExp(
         real_output_multiplier, &data->output_multiplier, &data->output_shift);
-    data->output_shift *= -1;
 
     CalculateActivationRangeUint8(params->activation, output,
                                   &data->output_activation_min,
@@ -152,14 +149,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
         CheckedLog2(output->params.scale, &output_scale_log2_rounded);
     TF_LITE_ENSURE(context, output_scale_is_pot);
 
-    data->input1_shift = output_scale_log2_rounded - input1_scale_log2_rounded;
-    data->input2_shift = output_scale_log2_rounded - input2_scale_log2_rounded;
+    data->input1_shift = input1_scale_log2_rounded - output_scale_log2_rounded;
+    data->input2_shift = input2_scale_log2_rounded - output_scale_log2_rounded;
 
     // Shifting of one input is supported. The graph quantization should ensure
     // that the other input matches the output.
     TF_LITE_ENSURE(context, data->input1_shift == 0 || data->input2_shift == 0);
-    TF_LITE_ENSURE(context, data->input1_shift >= 0);
-    TF_LITE_ENSURE(context, data->input2_shift >= 0);
+    TF_LITE_ENSURE(context, data->input1_shift <= 0);
+    TF_LITE_ENSURE(context, data->input2_shift <= 0);
 
     CalculateActivationRangeQuantized(context, params->activation, output,
                                       &data->output_activation_min,
@@ -173,24 +170,27 @@ template <KernelType kernel_type>
 void EvalAdd(TfLiteContext* context, TfLiteNode* node, TfLiteAddParams* params,
              const OpData* data, const TfLiteTensor* input1,
              const TfLiteTensor* input2, TfLiteTensor* output) {
-#define TF_LITE_ADD(type, opname, data_type)                            \
-  data_type output_activation_min, output_activation_max;               \
-  CalculateActivationRange(params->activation, &output_activation_min,  \
-                           &output_activation_max);                     \
-  type::opname(GetTensorData<data_type>(input1), GetTensorDims(input1), \
-               GetTensorData<data_type>(input2), GetTensorDims(input2), \
-               output_activation_min, output_activation_max,            \
-               GetTensorData<data_type>(output), GetTensorDims(output))
+#define TF_LITE_ADD(type, opname, data_type)                             \
+  data_type output_activation_min, output_activation_max;                \
+  CalculateActivationRange(params->activation, &output_activation_min,   \
+                           &output_activation_max);                      \
+  tflite::ArithmeticParams op_params;                                    \
+  SetActivationParams(output_activation_min, output_activation_max,      \
+                      &op_params);                                       \
+  type::opname(op_params, GetTensorShape(input1),                        \
+               GetTensorData<data_type>(input1), GetTensorShape(input2), \
+               GetTensorData<data_type>(input2), GetTensorShape(output), \
+               GetTensorData<data_type>(output))
   if (output->type == kTfLiteInt32) {
     if (kernel_type == kReference) {
       if (data->requires_broadcast) {
-        TF_LITE_ADD(reference_ops, BroadcastAdd, int32_t);
+        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, int32_t);
       } else {
         TF_LITE_ADD(reference_ops, Add, int32_t);
       }
     } else {
       if (data->requires_broadcast) {
-        TF_LITE_ADD(optimized_ops, BroadcastAdd, int32_t);
+        TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, int32_t);
       } else {
         TF_LITE_ADD(optimized_ops, Add, int32_t);
       }
@@ -198,13 +198,13 @@ void EvalAdd(TfLiteContext* context, TfLiteNode* node, TfLiteAddParams* params,
   } else if (output->type == kTfLiteFloat32) {
     if (kernel_type == kReference) {
       if (data->requires_broadcast) {
-        TF_LITE_ADD(reference_ops, BroadcastAdd, float);
+        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, float);
       } else {
         TF_LITE_ADD(reference_ops, Add, float);
       }
     } else {
       if (data->requires_broadcast) {
-        TF_LITE_ADD(optimized_ops, BroadcastAdd, float);
+        TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, float);
       } else {
         TF_LITE_ADD(optimized_ops, Add, float);
       }
@@ -220,30 +220,43 @@ TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                               const TfLiteTensor* input2,
                               TfLiteTensor* output) {
   if (output->type == kTfLiteUInt8) {
-#define TF_LITE_ADD(type, opname)                                              \
-  type::opname(                                                                \
-      data->left_shift, GetTensorData<uint8_t>(input1), GetTensorDims(input1), \
-      data->input1_offset, data->input1_multiplier, data->input1_shift,        \
-      GetTensorData<uint8_t>(input2), GetTensorDims(input2),                   \
-      data->input2_offset, data->input2_multiplier, data->input2_shift,        \
-      data->output_offset, data->output_multiplier, data->output_shift,        \
-      data->output_activation_min, data->output_activation_max,                \
-      GetTensorData<uint8_t>(output), GetTensorDims(output));
+#define TF_LITE_ADD(type, opname)                                      \
+  tflite::ArithmeticParams op_params;                                  \
+  op_params.left_shift = data->left_shift;                             \
+  op_params.input1_offset = data->input1_offset;                       \
+  op_params.input1_multiplier = data->input1_multiplier;               \
+  op_params.input1_shift = data->input1_shift;                         \
+  op_params.input2_offset = data->input2_offset;                       \
+  op_params.input2_multiplier = data->input2_multiplier;               \
+  op_params.input2_shift = data->input2_shift;                         \
+  op_params.output_offset = data->output_offset;                       \
+  op_params.output_multiplier = data->output_multiplier;               \
+  op_params.output_shift = data->output_shift;                         \
+  SetActivationParams(data->output_activation_min,                     \
+                      data->output_activation_max, &op_params);        \
+  type::opname(op_params, GetTensorShape(input1),                      \
+               GetTensorData<uint8_t>(input1), GetTensorShape(input2), \
+               GetTensorData<uint8_t>(input2), GetTensorShape(output), \
+               GetTensorData<uint8_t>(output))
     // The quantized version of Add doesn't support activations, so we
     // always use BroadcastAdd.
     if (kernel_type == kReference) {
-      TF_LITE_ADD(reference_ops, BroadcastAdd);
+      TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow);
     } else {
-      TF_LITE_ADD(optimized_ops, BroadcastAdd);
+      TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow);
     }
 #undef TF_LITE_ADD
   } else if (output->type == kTfLiteInt16) {
-#define TF_LITE_ADD(type, opname)                                        \
-  type::opname(GetTensorData<int16_t>(input1), GetTensorDims(input1),    \
-               data->input1_shift, GetTensorData<int16_t>(input2),       \
-               GetTensorDims(input2), data->input2_shift,                \
-               data->output_activation_min, data->output_activation_max, \
-               GetTensorData<int16_t>(output), GetTensorDims(output));
+#define TF_LITE_ADD(type, opname)                                      \
+  tflite::ArithmeticParams op_params;                                  \
+  op_params.input1_shift = data->input1_shift;                         \
+  op_params.input2_shift = data->input2_shift;                         \
+  SetActivationParams(data->output_activation_min,                     \
+                      data->output_activation_max, &op_params);        \
+  type::opname(op_params, GetTensorShape(input1),                      \
+               GetTensorData<int16_t>(input1), GetTensorShape(input2), \
+               GetTensorData<int16_t>(input2), GetTensorShape(output), \
+               GetTensorData<int16_t>(output))
     // The quantized version of Add doesn't support activations, so we
     // always use BroadcastAdd.
     if (kernel_type == kReference) {
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
index 14a19aeef390a71b99c4c9dff036746d102c9e9c..a11a59aa050675314ac8b1316cdd0f15c81b8b15 100644
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_lstm.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <unistd.h>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
index aa24c1f34cd1e8c02a6a75b62fbe5f3c629498ca..517309a226bcfb717186be8c1d02d68e3b337f8e 100644
--- a/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/bidirectional_sequence_rnn.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <unistd.h>
 #include <cassert>
 #include <cmath>
 #include <cstdlib>
diff --git a/tensorflow/contrib/lite/kernels/concatenation.cc b/tensorflow/contrib/lite/kernels/concatenation.cc
index 45ea8d00498455be98467f2f1addc8ad7dcf35fa..ad211e9c67eed9ca70fcdd51171fdb70bd89b27c 100644
--- a/tensorflow/contrib/lite/kernels/concatenation.cc
+++ b/tensorflow/contrib/lite/kernels/concatenation.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <unistd.h>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index a4fe9e55506bd01ce6c9142777c2ac632b37d46a..6f174763dfab9845d991b930e44b07a95e00d824 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <unistd.h>
 #include <algorithm>
 #include <cassert>
 #include <cmath>
diff --git a/tensorflow/contrib/lite/kernels/depthwise_conv.cc b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
index 16e5f1d065d8ea6d187c5e368d6c9385fe62514b..21518156b851892f50c62df7901d71c41fd733f7 100644
--- a/tensorflow/contrib/lite/kernels/depthwise_conv.cc
+++ b/tensorflow/contrib/lite/kernels/depthwise_conv.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <unistd.h>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/eigen_support.cc b/tensorflow/contrib/lite/kernels/eigen_support.cc
index 4f0d020793eb4c62dfd1e02af883e10adbfab436..e542ad076528fa30152abba074a5c7dcd6ca1f48 100644
--- a/tensorflow/contrib/lite/kernels/eigen_support.cc
+++ b/tensorflow/contrib/lite/kernels/eigen_support.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <utility>
 
+#include "tensorflow/contrib/lite/arena_planner.h"
 #include "tensorflow/contrib/lite/kernels/internal/optimized/eigen_spatial_convolutions.h"
 #include "tensorflow/contrib/lite/kernels/op_macros.h"
 
@@ -23,6 +24,16 @@ namespace tflite {
 namespace eigen_support {
 namespace {
 
+#ifndef EIGEN_DONT_ALIGN
+// Eigen may require buffers to be algiend to 16, 32 or 64 bytes depending on
+// hardware architecture and build configurations.
+// If the static assertion fails, try to increase `kDefaultTensorAlignment` to
+// in `arena_planner.h` to 32 or 64.
+static_assert(
+    kDefaultTensorAlignment % EIGEN_MAX_ALIGN_BYTES == 0,
+    "kDefaultArenaAlignment doesn't comply with Eigen alignment requirement.");
+#endif  // EIGEN_DONT_ALIGN
+
 // We have a single global threadpool for all convolution operations. This means
 // that inferences started from different threads may block each other, but
 // since the underlying resource of CPU cores should be consumed by the
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup.cc b/tensorflow/contrib/lite/kernels/embedding_lookup.cc
index f550339d03c4c774ae5bc90fd81079d12efe69ae..b2dff87e6296c6038241c704d9158e174501f026 100644
--- a/tensorflow/contrib/lite/kernels/embedding_lookup.cc
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup.cc
@@ -29,7 +29,6 @@ limitations under the License.
 //   When indices are out of bound, the ops will not succeed.
 //
 
-#include <unistd.h>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/fully_connected.cc b/tensorflow/contrib/lite/kernels/fully_connected.cc
index 3b203dd480f95c5dc70a69aafce0bac6ab2cbc06..bc370608c092eeb5312dc40b56f47740f473c8ae 100644
--- a/tensorflow/contrib/lite/kernels/fully_connected.cc
+++ b/tensorflow/contrib/lite/kernels/fully_connected.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <unistd.h>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
@@ -71,7 +70,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   // Instead, we allocate a new object to carry information from Prepare() to
   // Eval().
   gemm_support::IncrementUsageCounter(context);
-  auto* op_data = new OpData;
+  auto* op_data = new OpData();
   context->AddTensors(context, 1, &op_data->input_quantized_index);
   return op_data;
 }
diff --git a/tensorflow/contrib/lite/kernels/hashtable_lookup.cc b/tensorflow/contrib/lite/kernels/hashtable_lookup.cc
index 41211d41aa85a5a2da6ae96dc6f0337c54fb1a45..f37c66acb33eb9995772e595b84df6616e8d9e6a 100644
--- a/tensorflow/contrib/lite/kernels/hashtable_lookup.cc
+++ b/tensorflow/contrib/lite/kernels/hashtable_lookup.cc
@@ -31,7 +31,6 @@ limitations under the License.
 //   Each item indicates whether the corresponding lookup has a returned value.
 //   0 for missing key, 1 for found key.
 
-#include <unistd.h>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/internal/common.h b/tensorflow/contrib/lite/kernels/internal/common.h
index b86ca49c116875672c4516a2a47f7dae511a7116..310a8980e6943db3804b0671a21ccf0e6ce34c28 100644
--- a/tensorflow/contrib/lite/kernels/internal/common.h
+++ b/tensorflow/contrib/lite/kernels/internal/common.h
@@ -127,6 +127,139 @@ int CountLeadingZeros(T integer_input) {
   return leading_zeros;
 }
 
+// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// BROADCASTING.
+//
+// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
+// rectangular array of numbers.
+//
+// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
+// However, as Dims<N> is to be deprecated, this class exists as an adaptor
+// to enable simple unoptimized implementations of element-wise broadcasting
+// operations.
+template <int N>
+struct NdArrayDesc {
+  // The "extent" of each dimension. Indices along dimension d must be in the
+  // half-open interval [0, extents[d]).
+  int extents[N];
+
+  // The number of *elements* (not bytes) between consecutive indices of each
+  // dimension.
+  int strides[N];
+};
+
+// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
+// BROADCASTING.
+//
+// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
+inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
+                            int i3) {
+  TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
+  TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
+  TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
+  TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
+  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
+         i3 * desc.strides[3];
+}
+
+// Given the dimensions of the operands for an element-wise binary broadcast,
+// adjusts them so that they can be directly iterated over with simple loops.
+// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
+// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
+//
+// This function assumes that the two input shapes are compatible up to
+// broadcasting and the shorter one has already been prepended with 1s to be the
+// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
+// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
+// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be
+// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
+//
+// When two shapes are compatible up to broadcasting, for each dimension d,
+// the input extents are either equal, or one of them is 1.
+//
+// This function performs the following for each dimension d:
+// - If the extents are equal, then do nothing since the loop that walks over
+//   both of the input arrays is correct.
+// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
+//   and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
+//   array0 to be referenced *at any index* in dimension d and still access the
+//   same slice.
+template <int N>
+inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
+                                                const Dims<N>& input1_dims,
+                                                NdArrayDesc<N>* desc0_out,
+                                                NdArrayDesc<N>* desc1_out) {
+  TFLITE_DCHECK(desc0_out != nullptr);
+  TFLITE_DCHECK(desc1_out != nullptr);
+
+  // Copy dims to desc.
+  for (int i = 0; i < N; ++i) {
+    desc0_out->extents[i] = input0_dims.sizes[i];
+    desc0_out->strides[i] = input0_dims.strides[i];
+    desc1_out->extents[i] = input1_dims.sizes[i];
+    desc1_out->strides[i] = input1_dims.strides[i];
+  }
+
+  // Walk over each dimension. If the extents are equal do nothing.
+  // Otherwise, set the desc with extent 1 to have extent equal to the other and
+  // stride 0.
+  for (int i = 0; i < N; ++i) {
+    const int extent0 = ArraySize(input0_dims, i);
+    const int extent1 = ArraySize(input1_dims, i);
+    if (extent0 != extent1) {
+      if (extent0 == 1) {
+        desc0_out->strides[i] = 0;
+        desc0_out->extents[i] = extent1;
+      } else {
+        TFLITE_DCHECK_EQ(extent1, 1);
+        desc1_out->strides[i] = 0;
+        desc1_out->extents[i] = extent0;
+      }
+    }
+  }
+}
+
+template <int N>
+inline void NdArrayDescsForElementwiseBroadcast(
+    const RuntimeShape& input0_shape, const RuntimeShape& input1_shape,
+    NdArrayDesc<N>* desc0_out, NdArrayDesc<N>* desc1_out) {
+  TFLITE_DCHECK(desc0_out != nullptr);
+  TFLITE_DCHECK(desc1_out != nullptr);
+
+  auto extended_input0_shape = RuntimeShape::ExtendedShape(N, input0_shape);
+  auto extended_input1_shape = RuntimeShape::ExtendedShape(N, input1_shape);
+
+  // Copy dims to desc, calculating strides.
+  int desc0_stride = 1;
+  int desc1_stride = 1;
+  for (int i = N - 1; i >= 0; --i) {
+    desc0_out->extents[i] = extended_input0_shape.Dims(i);
+    desc0_out->strides[i] = desc0_stride;
+    desc0_stride *= extended_input0_shape.Dims(i);
+    desc1_out->extents[i] = extended_input1_shape.Dims(i);
+    desc1_out->strides[i] = desc1_stride;
+    desc1_stride *= extended_input1_shape.Dims(i);
+  }
+
+  // Walk over each dimension. If the extents are equal do nothing.
+  // Otherwise, set the desc with extent 1 to have extent equal to the other and
+  // stride 0.
+  for (int i = 0; i < N; ++i) {
+    const int extent0 = extended_input0_shape.Dims(i);
+    const int extent1 = extended_input1_shape.Dims(i);
+    if (extent0 != extent1) {
+      if (extent0 == 1) {
+        desc0_out->strides[i] = 0;
+        desc0_out->extents[i] = extent1;
+      } else {
+        TFLITE_DCHECK_EQ(extent1, 1);
+        desc1_out->strides[i] = 0;
+        desc1_out->extents[i] = extent0;
+      }
+    }
+  }
+}
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_COMMON_H_
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h
index 6db41d796156827567adb3ba98698ebfd27b7dc4..d5503073a7cfc0be137fde104815ca1a2a6bb438 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/legacy_optimized_ops.h
@@ -55,6 +55,245 @@ inline void Relu(const float* input_data, const Dims<4>& input_dims,
        DimsToShape(output_dims));
 }
 
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac>
+void Add(const float* input1_data, const Dims<4>& input1_dims,
+         const float* input2_data, const Dims<4>& input2_dims,
+         float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  tflite::ArithmeticParams op_params;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+  Add(op_params, DimsToShape(input1_dims), input1_data,
+      DimsToShape(input2_dims), input2_data, DimsToShape(output_dims),
+      output_data);
+}
+
+template <FusedActivationFunctionType Ac>
+inline void Add(int left_shift, const uint8* input1_data,
+                const Dims<4>& input1_dims, int32 input1_offset,
+                int32 input1_multiplier, int input1_shift,
+                const uint8* input2_data, const Dims<4>& input2_dims,
+                int32 input2_offset, int32 input2_multiplier, int input2_shift,
+                int32 output_offset, int32 output_multiplier, int output_shift,
+                int32 output_activation_min, int32 output_activation_max,
+                uint8* output_data, const Dims<4>& output_dims) {
+  constexpr int kReverseShift = -1;
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+
+  tflite::ArithmeticParams op_params;
+  op_params.left_shift = left_shift;
+  op_params.input1_offset = input1_offset;
+  op_params.input1_multiplier = input1_multiplier;
+  op_params.input1_shift = kReverseShift * input1_shift;
+  op_params.input2_offset = input2_offset;
+  op_params.input2_multiplier = input2_multiplier;
+  op_params.input2_shift = kReverseShift * input2_shift;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+  Add(op_params, DimsToShape(input1_dims), input1_data,
+      DimsToShape(input2_dims), input2_data, DimsToShape(output_dims),
+      output_data);
+}
+
+template <FusedActivationFunctionType Ac>
+void Add(const int32* input1_data, const Dims<4>& input1_dims,
+         const int32* input2_data, const Dims<4>& input2_dims,
+         int32* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Add/int32");
+  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+
+  tflite::ArithmeticParams op_params;
+  op_params.quantized_activation_min = std::numeric_limits<int32>::min();
+  op_params.quantized_activation_max = std::numeric_limits<int32>::max();
+  Add(op_params, DimsToShape(input1_dims), input1_data,
+      DimsToShape(input2_dims), input2_data, DimsToShape(output_dims),
+      output_data);
+}
+
+template <typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  tflite::ArithmeticParams op_params;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+  BroadcastAdd4DSlow(op_params, DimsToShape(input1_dims), input1_data,
+                     DimsToShape(input2_dims), input2_data,
+                     DimsToShape(output_dims), output_data);
+}
+
+template <FusedActivationFunctionType Ac>
+inline void BroadcastAdd(int left_shift, const uint8* input1_data,
+                         const Dims<4>& input1_dims, int32 input1_offset,
+                         int32 input1_multiplier, int input1_shift,
+                         const uint8* input2_data, const Dims<4>& input2_dims,
+                         int32 input2_offset, int32 input2_multiplier,
+                         int input2_shift, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  constexpr int kReverseShift = -1;
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+
+  tflite::ArithmeticParams op_params;
+  op_params.left_shift = left_shift;
+  op_params.input1_offset = input1_offset;
+  op_params.input1_multiplier = input1_multiplier;
+  op_params.input1_shift = kReverseShift * input1_shift;
+  op_params.input2_offset = input2_offset;
+  op_params.input2_multiplier = input2_multiplier;
+  op_params.input2_shift = kReverseShift * input2_shift;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+  BroadcastAdd4DSlow(op_params, DimsToShape(input1_dims), input1_data,
+                     DimsToShape(input2_dims), input2_data,
+                     DimsToShape(output_dims), output_data);
+}
+
+template <FusedActivationFunctionType Ac>
+inline void BroadcastAddFivefold(
+    int y0, int y1, int y2, int y3, int y4, int left_shift,
+    const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
+    int32 input1_multiplier, int input1_shift, const uint8* input2_data,
+    const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
+    int input2_shift, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_data, const Dims<4>& output_dims) {
+  constexpr int kReverseShift = -1;
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  tflite::ArithmeticParams op_params;
+  op_params.broadcast_category =
+      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+  op_params.left_shift = left_shift;
+  op_params.input1_offset = input1_offset;
+  op_params.input1_multiplier = input1_multiplier;
+  op_params.input1_shift = kReverseShift * input1_shift;
+  op_params.input2_offset = input2_offset;
+  op_params.input2_multiplier = input2_multiplier;
+  op_params.input2_shift = kReverseShift * input2_shift;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+  op_params.broadcast_shape[4] = y0;
+  op_params.broadcast_shape[3] = y1;
+  op_params.broadcast_shape[2] = y2;
+  op_params.broadcast_shape[1] = y3;
+  op_params.broadcast_shape[0] = y4;
+  BroadcastAddFivefold(op_params, DimsToShape(input1_dims), input1_data,
+                       DimsToShape(input2_dims), input2_data,
+                       DimsToShape(output_dims), output_data);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
+template <FusedActivationFunctionType Ac>
+inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
+                int input1_shift, const int16* input2_data,
+                const Dims<4>& input2_dims, int input2_shift,
+                int16 output_activation_min, int16 output_activation_max,
+                int16* output_data, const Dims<4>& output_dims) {
+  constexpr int kReverseShift = -1;
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, -32768);
+    TFLITE_DCHECK_EQ(output_activation_max, 32767);
+  }
+
+  tflite::ArithmeticParams op_params;
+  op_params.input1_shift = kReverseShift * input1_shift;
+  op_params.input2_shift = kReverseShift * input2_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+  Add(op_params, DimsToShape(input1_dims), input1_data,
+      DimsToShape(input2_dims), input2_data, DimsToShape(output_dims),
+      output_data);
+}
+
+inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(FusedActivationFunctionType::kNone,
+                      &output_activation_min, &output_activation_max);
+  tflite::ArithmeticParams op_params;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+  Sub(op_params, DimsToShape(input1_dims), input1_data,
+      DimsToShape(input2_dims), input2_data, DimsToShape(output_dims),
+      output_data);
+}
+
+template <typename T>
+void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,
+         const Dims<4>& input2_dims, T* output_data,
+         const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(FusedActivationFunctionType::kNone,
+                      &output_activation_min, &output_activation_max);
+  tflite::ArithmeticParams op_params;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+  Sub(op_params, DimsToShape(input1_dims), input1_data,
+      DimsToShape(input2_dims), input2_data, DimsToShape(output_dims),
+      output_data);
+}
+
 inline void AveragePool(const float* input_data, const Dims<4>& input_dims,
                         int stride_width, int stride_height, int pad_width,
                         int pad_height, int kwidth, int kheight,
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
index 45c9f65b645616b516875b10155760d7c6ab59da..63c89d1eeef47b206fc871929f1fb1295b2f70ff 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/neon_tensor_utils.h
@@ -115,10 +115,10 @@ void ClipVector(const float* vector, int v_size, float abs_limit,
 }
 
 void SymmetricQuantizeFloats(const float* values, const int size,
-                             int8_t* quantized_values, float* min, float* max,
-                             float* scaling_factor) {
-  NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values, min,
-                   max, scaling_factor);
+                             int8_t* quantized_values, float* min_value,
+                             float* max_value, float* scaling_factor) {
+  NEON_OR_PORTABLE(SymmetricQuantizeFloats, values, size, quantized_values,
+                   min_value, max_value, scaling_factor);
 }
 
 void VectorShiftLeft(float* vector, int v_size, float shift_value) {
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 2f73036e030f122f566de156e643f88f1ad6666d..78567d52eaab779c724d3e3d04fbaf92fe6e589b 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -42,10 +42,12 @@ namespace optimized_ops {
 // Unoptimized reference ops:
 using reference_ops::ArgMax;
 using reference_ops::ArgMinMax;
+using reference_ops::BroadcastAdd4DSlow;
 using reference_ops::BroadcastGreater;
 using reference_ops::BroadcastGreaterEqual;
 using reference_ops::BroadcastLess;
 using reference_ops::BroadcastLessEqual;
+using reference_ops::BroadcastSub4DSlow;
 using reference_ops::Concatenation;
 using reference_ops::DepthConcatenation;
 using reference_ops::Dequantize;
@@ -217,98 +219,6 @@ SaturatingRoundingMultiplyByPOTParam(
       SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
 }
 
-// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE
-// BROADCASTING.
-//
-// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
-// rectangular array of numbers.
-//
-// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
-// However, as Dims<N> is to be deprecated, this class exists as an adaptor
-// to enable simple unoptimized implementations of element-wise broadcasting
-// operations.
-template <int N>
-struct NdArrayDesc {
-  // The "extent" of each dimension. Indices along dimension d must be in the
-  // half-open interval [0, extents[d]).
-  int extents[N];
-
-  // The number of *elements* (not bytes) between consecutive indices of each
-  // dimension.
-  int strides[N];
-};
-
-// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
-// ELEMENT-WISE BROADCASTING.
-//
-// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
-inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
-                            int i3) {
-  TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
-  TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
-  TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
-  TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
-  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
-         i3 * desc.strides[3];
-}
-
-// Given the dimensions of the operands for an element-wise binary broadcast,
-// adjusts them so that they can be directly iterated over with simple loops.
-// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
-// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
-//
-// This function assumes that the two input shapes are compatible up to
-// broadcasting and the shorter one has already been prepended with 1s to be the
-// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
-// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
-// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be
-// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
-//
-// When two shapes are compatible up to broadcasting, for each dimension d,
-// the input extents are either equal, or one of them is 1.
-//
-// This function performs the following for each dimension d:
-// - If the extents are equal, then do nothing since the loop that walks over
-//   both of the input arrays is correct.
-// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
-//   and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
-//   array0 to be referenced *at any index* in dimension d and still access the
-//   same slice.
-template <int N>
-inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
-                                                const Dims<N>& input1_dims,
-                                                NdArrayDesc<N>* desc0_out,
-                                                NdArrayDesc<N>* desc1_out) {
-  TFLITE_DCHECK(desc0_out != nullptr);
-  TFLITE_DCHECK(desc1_out != nullptr);
-
-  // Copy dims to desc.
-  for (int i = 0; i < N; ++i) {
-    desc0_out->extents[i] = input0_dims.sizes[i];
-    desc0_out->strides[i] = input0_dims.strides[i];
-    desc1_out->extents[i] = input1_dims.sizes[i];
-    desc1_out->strides[i] = input1_dims.strides[i];
-  }
-
-  // Walk over each dimension. If the extents are equal do nothing.
-  // Otherwise, set the desc with extent 1 to have extent equal to the other and
-  // stride 0.
-  for (int i = 0; i < N; ++i) {
-    const int extent0 = ArraySize(input0_dims, i);
-    const int extent1 = ArraySize(input1_dims, i);
-    if (extent0 != extent1) {
-      if (extent0 == 1) {
-        desc0_out->strides[i] = 0;
-        desc0_out->extents[i] = extent1;
-      } else {
-        TFLITE_DCHECK_EQ(extent1, 1);
-        desc1_out->strides[i] = 0;
-        desc1_out->extents[i] = extent0;
-      }
-    }
-  }
-}
-
 inline bool AreSameDims(const Dims<4>& dims1, const Dims<4>& dims2) {
   for (int i = 0; i < 4; i++) {
     if (dims1.sizes[i] != dims2.sizes[i]) {
@@ -2478,20 +2388,17 @@ inline void L2Normalization(const uint8* input_data,
   }
 }
 
-inline void Add(const float* input1_data, const Dims<4>& input1_dims,
-                const float* input2_data, const Dims<4>& input2_dims,
-                float output_activation_min, float output_activation_max,
-                float* output_data, const Dims<4>& output_dims) {
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const float* input1_data,
+                const RuntimeShape& input2_shape, const float* input2_data,
+                const RuntimeShape& output_shape, float* output_data) {
   gemmlowp::ScopedProfilingLabel label("Add");
-  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
 
   int i = 0;
-  const int size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+  const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
 #ifdef USE_NEON
-  const auto activation_min = vdupq_n_f32(output_activation_min);
-  const auto activation_max = vdupq_n_f32(output_activation_max);
+  const auto activation_min = vdupq_n_f32(params.float_activation_min);
+  const auto activation_max = vdupq_n_f32(params.float_activation_max);
   for (; i <= size - 16; i += 16) {
     auto a10 = vld1q_f32(input1_data + i);
     auto a11 = vld1q_f32(input1_data + i + 4);
@@ -2530,29 +2437,26 @@ inline void Add(const float* input1_data, const Dims<4>& input1_dims,
 
   for (; i < size; i++) {
     auto x = input1_data[i] + input2_data[i];
-    output_data[i] = ActivationFunctionWithMinMax(x, output_activation_min,
-                                                  output_activation_max);
+    output_data[i] = ActivationFunctionWithMinMax(
+        x, params.float_activation_min, params.float_activation_max);
   }
 }
 
 // Element-wise add that can often be used for inner loop of broadcast add as
 // well as the non-broadcast add.
-inline void AddElementwise(int size, int left_shift, const uint8* input1_data,
-                           int32 input1_offset, int32 input1_multiplier,
-                           int input1_shift, const uint8* input2_data,
-                           int32 input2_offset, int32 input2_multiplier,
-                           int input2_shift, int32 output_offset,
-                           int32 output_multiplier, int output_shift,
-                           int32 output_activation_min,
-                           int32 output_activation_max, uint8* output_data) {
+inline void AddElementwise(int size, const ArithmeticParams& params,
+                           const uint8* input1_data, const uint8* input2_data,
+                           uint8* output_data) {
   int i = 0;
-  TFLITE_DCHECK_GT(input1_offset, -256);
-  TFLITE_DCHECK_GT(input2_offset, -256);
-  TFLITE_DCHECK_LT(input1_offset, 256);
-  TFLITE_DCHECK_LT(input2_offset, 256);
+  TFLITE_DCHECK_GT(params.input1_offset, -256);
+  TFLITE_DCHECK_GT(params.input2_offset, -256);
+  TFLITE_DCHECK_LT(params.input1_offset, 256);
+  TFLITE_DCHECK_LT(params.input2_offset, 256);
 #ifdef USE_NEON
-  const auto output_activation_min_vector = vdup_n_u8(output_activation_min);
-  const auto output_activation_max_vector = vdup_n_u8(output_activation_max);
+  const auto output_activation_min_vector =
+      vdup_n_u8(params.quantized_activation_min);
+  const auto output_activation_max_vector =
+      vdup_n_u8(params.quantized_activation_max);
   for (; i <= size - 8; i += 8) {
     const auto input1_val_original = vld1_u8(input1_data + i);
     const auto input2_val_original = vld1_u8(input2_data + i);
@@ -2561,9 +2465,9 @@ inline void AddElementwise(int size, int left_shift, const uint8* input1_data,
     const auto input2_val_s16 =
         vreinterpretq_s16_u16(vmovl_u8(input2_val_original));
     const auto input1_val =
-        vaddq_s16(input1_val_s16, vdupq_n_s16(input1_offset));
+        vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset));
     const auto input2_val =
-        vaddq_s16(input2_val_s16, vdupq_n_s16(input2_offset));
+        vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset));
     const auto input1_val_high = vget_high_s16(input1_val);
     const auto input1_val_low = vget_low_s16(input1_val);
     const auto input2_val_high = vget_high_s16(input2_val);
@@ -2572,32 +2476,32 @@ inline void AddElementwise(int size, int left_shift, const uint8* input1_data,
     auto x12 = vmovl_s16(input1_val_high);
     auto x21 = vmovl_s16(input2_val_low);
     auto x22 = vmovl_s16(input2_val_high);
-    const auto left_shift_dup = vdupq_n_s32(left_shift);
+    const auto left_shift_dup = vdupq_n_s32(params.left_shift);
     x11 = vshlq_s32(x11, left_shift_dup);
     x12 = vshlq_s32(x12, left_shift_dup);
     x21 = vshlq_s32(x21, left_shift_dup);
     x22 = vshlq_s32(x22, left_shift_dup);
-    x11 = vqrdmulhq_n_s32(x11, input1_multiplier);
-    x12 = vqrdmulhq_n_s32(x12, input1_multiplier);
-    x21 = vqrdmulhq_n_s32(x21, input2_multiplier);
-    x22 = vqrdmulhq_n_s32(x22, input2_multiplier);
-    const auto input1_shift_dup = vdupq_n_s32(-input1_shift);
-    const auto input2_shift_dup = vdupq_n_s32(-input2_shift);
+    x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier);
+    x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier);
+    x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier);
+    x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier);
+    const auto input1_shift_dup = vdupq_n_s32(params.input1_shift);
+    const auto input2_shift_dup = vdupq_n_s32(params.input2_shift);
     x11 = vshlq_s32(x11, input1_shift_dup);
     x12 = vshlq_s32(x12, input1_shift_dup);
     x21 = vshlq_s32(x21, input2_shift_dup);
     x22 = vshlq_s32(x22, input2_shift_dup);
     auto s1 = vaddq_s32(x11, x21);
     auto s2 = vaddq_s32(x12, x22);
-    s1 = vqrdmulhq_n_s32(s1, output_multiplier);
-    s2 = vqrdmulhq_n_s32(s2, output_multiplier);
+    s1 = vqrdmulhq_n_s32(s1, params.output_multiplier);
+    s2 = vqrdmulhq_n_s32(s2, params.output_multiplier);
     using gemmlowp::RoundingDivideByPOT;
-    s1 = RoundingDivideByPOT(s1, output_shift);
-    s2 = RoundingDivideByPOT(s2, output_shift);
+    s1 = RoundingDivideByPOT(s1, -params.output_shift);
+    s2 = RoundingDivideByPOT(s2, -params.output_shift);
     const auto s1_narrowed = vmovn_s32(s1);
     const auto s2_narrowed = vmovn_s32(s2);
     const auto s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed),
-                             vdupq_n_s16(output_offset));
+                             vdupq_n_s16(params.output_offset));
     const auto clamped =
         vmax_u8(output_activation_min_vector,
                 vmin_u8(output_activation_max_vector, vqmovun_s16(s)));
@@ -2606,101 +2510,74 @@ inline void AddElementwise(int size, int left_shift, const uint8* input1_data,
 #endif  // NEON
 
   for (; i < size; ++i) {
-    const int32 input1_val = input1_offset + input1_data[i];
-    const int32 input2_val = input2_offset + input2_data[i];
-    const int32 shifted_input1_val = input1_val * (1 << left_shift);
-    const int32 shifted_input2_val = input2_val * (1 << left_shift);
+    const int32 input1_val = params.input1_offset + input1_data[i];
+    const int32 input2_val = params.input2_offset + input2_data[i];
+    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
     const int32 scaled_input1_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            shifted_input1_val, input1_multiplier,
-            kReverseShift * input1_shift);
+            shifted_input1_val, params.input1_multiplier, params.input1_shift);
     const int32 scaled_input2_val =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            shifted_input2_val, input2_multiplier,
-            kReverseShift * input2_shift);
+            shifted_input2_val, params.input2_multiplier, params.input2_shift);
     const int32 raw_sum = scaled_input1_val + scaled_input2_val;
     const int32 raw_output =
         MultiplyByQuantizedMultiplierSmallerThanOneExp(
-            raw_sum, output_multiplier, kReverseShift * output_shift) +
-        output_offset;
-    const int32 clamped_output = std::min(
-        output_activation_max, std::max(output_activation_min, raw_output));
+            raw_sum, params.output_multiplier, params.output_shift) +
+        params.output_offset;
+    const int32 clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, raw_output));
     output_data[i] = static_cast<uint8>(clamped_output);
   }
 }
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void Add(const float* input1_data, const Dims<4>& input1_dims,
-         const float* input2_data, const Dims<4>& input2_dims,
-         float* output_data, const Dims<4>& output_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-
-  Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
-      output_activation_max, output_data, output_dims);
-}
-
-template <FusedActivationFunctionType Ac>
-inline void Add(int left_shift, const uint8* input1_data,
-                const Dims<4>& input1_dims, int32 input1_offset,
-                int32 input1_multiplier, int input1_shift,
-                const uint8* input2_data, const Dims<4>& input2_dims,
-                int32 input2_offset, int32 input2_multiplier, int input2_shift,
-                int32 output_offset, int32 output_multiplier, int output_shift,
-                int32 output_activation_min, int32 output_activation_max,
-                uint8* output_data, const Dims<4>& output_dims) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const uint8* input1_data,
+                const RuntimeShape& input2_shape, const uint8* input2_data,
+                const RuntimeShape& output_shape, uint8* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
   gemmlowp::ScopedProfilingLabel label("Add/8bit");
-  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
-  TFLITE_DCHECK(IsPackedWithoutStrides(input1_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(input2_dims));
-  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
-
-  TFLITE_DCHECK_GT(input1_offset, -256);
-  TFLITE_DCHECK_GT(input2_offset, -256);
-  TFLITE_DCHECK_LT(input1_offset, 256);
-  TFLITE_DCHECK_LT(input2_offset, 256);
-  AddElementwise(flat_size, left_shift, input1_data, input1_offset,
-                 input1_multiplier, input1_shift, input2_data, input2_offset,
-                 input2_multiplier, input2_shift, output_offset,
-                 output_multiplier, output_shift, output_activation_min,
-                 output_activation_max, output_data);
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+
+  TFLITE_DCHECK_GT(params.input1_offset, -256);
+  TFLITE_DCHECK_GT(params.input2_offset, -256);
+  TFLITE_DCHECK_LT(params.input1_offset, 256);
+  TFLITE_DCHECK_LT(params.input2_offset, 256);
+  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
 }
 
-inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
-                int input1_shift, const int16* input2_data,
-                const Dims<4>& input2_dims, int input2_shift,
-                int16 output_activation_min, int16 output_activation_max,
-                int16* output_data, const Dims<4>& output_dims) {
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int16* input1_data,
+                const RuntimeShape& input2_shape, const int16* input2_data,
+                const RuntimeShape& output_shape, int16* output_data) {
   gemmlowp::ScopedProfilingLabel label("Add/Int16");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
 
-  const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims);
+  const int input1_shift = params.input1_shift;
+  const int flat_size =
+      MatchingFlatSize(output_shape, input1_shape, input2_shape);
+  const int16 output_activation_min = params.quantized_activation_min;
+  const int16 output_activation_max = params.quantized_activation_max;
 
-  TFLITE_DCHECK(input1_shift == 0 || input2_shift == 0);
-  TFLITE_DCHECK_GE(input1_shift, 0);
-  TFLITE_DCHECK_GE(input2_shift, 0);
+  TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
+  TFLITE_DCHECK_LE(input1_shift, 0);
+  TFLITE_DCHECK_LE(params.input2_shift, 0);
   const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
   const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
-  const int input_shift = input1_shift == 0 ? input2_shift : input1_shift;
+  const int input_right_shift =
+      input1_shift == 0 ? -params.input2_shift : -input1_shift;
 
   for (int i = 0; i < flat_size; i++) {
     // F0 uses 0 integer bits, range [-1, 1].
     using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
 
     F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
-    F0 scaled_input =
-        F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_shift));
+    F0 scaled_input = F0::FromRaw(
+        gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
     F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
     const int16 raw_output = result.raw();
     const int16 clamped_output = std::min(
@@ -2709,195 +2586,59 @@ inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-inline void Add(const int32* input1_data, const Dims<4>& input1_dims,
-                const int32* input2_data, const Dims<4>& input2_dims,
-                int32 output_activation_min, int32 output_activation_max,
-                int32* output_data, const Dims<4>& output_dims) {
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int32* input1_data,
+                const RuntimeShape& input2_shape, const int32* input2_data,
+                const RuntimeShape& output_shape, int32* output_data) {
   gemmlowp::ScopedProfilingLabel label("Add/int32");
 
-  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
-  for (int i = 0; i < flat_size; ++i) {
-    output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] + input2_data[i], output_activation_min,
-        output_activation_max);
-  }
-}
-
-template <FusedActivationFunctionType Ac>
-inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
-                int input1_shift, const int16* input2_data,
-                const Dims<4>& input2_dims, int input2_shift,
-                int16 output_activation_min, int16 output_activation_max,
-                int16* output_data, const Dims<4>& output_dims) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, -32768);
-    TFLITE_DCHECK_EQ(output_activation_max, 32767);
-  }
-
-  Add(input1_data, input1_dims, input1_shift, input2_data, input2_dims,
-      input2_shift, output_activation_min, output_activation_max, output_data,
-      output_dims);
-}
-
-template <FusedActivationFunctionType Ac>
-void Add(const int32* input1_data, const Dims<4>& input1_dims,
-         const int32* input2_data, const Dims<4>& input2_dims,
-         int32* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Add/int32");
-  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
-
-  auto input1_map = MapAsVector(input1_data, input1_dims);
-  auto input2_map = MapAsVector(input2_data, input2_dims);
-  auto output_map = MapAsVector(output_data, output_dims);
-  if (AreSameDims(input1_dims, input2_dims)) {
+  auto input1_map = MapAsVector(input1_data, input1_shape);
+  auto input2_map = MapAsVector(input2_data, input2_shape);
+  auto output_map = MapAsVector(output_data, output_shape);
+  if (input1_shape == input2_shape) {
     output_map.array() = input1_map.array() + input2_map.array();
-  } else if (FlatSize(input2_dims) == 1) {
+  } else if (input2_shape.FlatSize() == 1) {
     auto scalar = input2_data[0];
     output_map.array() = input1_map.array() + scalar;
-  } else if (FlatSize(input1_dims) == 1) {
+  } else if (input1_shape.FlatSize() == 1) {
     auto scalar = input1_data[0];
     output_map.array() = scalar + input2_map.array();
   } else {
     // Should not come here.
     TFLITE_DCHECK(false);
   }
+  output_map = output_map.cwiseMax(params.quantized_activation_min);
+  output_map = output_map.cwiseMin(params.quantized_activation_max);
 }
 
-// TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
-// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
-// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
-// reference_ops.h.
-template <typename T>
-void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T output_activation_min, T output_activation_max,
-                  T* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
-                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
-  }
-}
-
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac, typename T>
-void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T* output_data, const Dims<4>& output_dims) {
-  T output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-
-  BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
-               output_activation_min, output_activation_max, output_data,
-               output_dims);
-}
-
-inline void BroadcastAdd(int left_shift, const uint8* input1_data,
-                         const Dims<4>& input1_dims, int32 input1_offset,
-                         int32 input1_multiplier, int input1_shift,
-                         const uint8* input2_data, const Dims<4>& input2_dims,
-                         int32 input2_offset, int32 input2_multiplier,
-                         int input2_shift, int32 output_offset,
-                         int32 output_multiplier, int output_shift,
-                         int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastAddGeneric/8bit");
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          const int32 input1_val =
-              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
-          const int32 input2_val =
-              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
-          const int32 shifted_input1_val = input1_val * (1 << left_shift);
-          const int32 shifted_input2_val = input2_val * (1 << left_shift);
-          const int32 scaled_input1_val =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input1_val, input1_multiplier,
-                  kReverseShift * input1_shift);
-          const int32 scaled_input2_val =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input2_val, input2_multiplier,
-                  kReverseShift * input2_shift);
-          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
-          const int32 raw_output =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  raw_sum, output_multiplier, kReverseShift * output_shift) +
-              output_offset;
-          const int32 clamped_output =
-              std::min(output_activation_max,
-                       std::max(output_activation_min, raw_output));
-          output_data[Offset(output_dims, c, x, y, b)] =
-              static_cast<uint8>(clamped_output);
-        }
-      }
-    }
-  }
-}
-
-inline void BroadcastAddFivefold(
-    int y0, int y1, int y2, int y3, int y4, int left_shift,
-    const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
-    int32 input1_multiplier, int input1_shift, const uint8* input2_data,
-    const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
-    int input2_shift, int32 output_offset, int32 output_multiplier,
-    int output_shift, int32 output_activation_min, int32 output_activation_max,
-    uint8* output_data, const Dims<4>& output_dims) {
+inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
+                                 const RuntimeShape& unswitched_input1_shape,
+                                 const uint8* unswitched_input1_data,
+                                 const RuntimeShape& unswitched_input2_shape,
+                                 const uint8* unswitched_input2_data,
+                                 const RuntimeShape& output_shape,
+                                 uint8* output_data) {
   gemmlowp::ScopedProfilingLabel label("BroadcastAddFivefold/8bit");
 
+  ArithmeticParams switched_params = unswitched_params;
+  switched_params.input1_offset = unswitched_params.input2_offset;
+  switched_params.input1_multiplier = unswitched_params.input2_multiplier;
+  switched_params.input1_shift = unswitched_params.input2_shift;
+  switched_params.input2_offset = unswitched_params.input1_offset;
+  switched_params.input2_multiplier = unswitched_params.input1_multiplier;
+  switched_params.input2_shift = unswitched_params.input1_shift;
+
+  const bool use_unswitched =
+      unswitched_params.broadcast_category ==
+      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+  const ArithmeticParams& params =
+      use_unswitched ? unswitched_params : switched_params;
+  const uint8* input1_data =
+      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+  const uint8* input2_data =
+      use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
   // Fivefold nested loops. The second input resets its position for each
   // iteration of the second loop. The first input resets its position at the
   // beginning of the fourth loop. The innermost loop is an elementwise add of
@@ -2905,82 +2646,29 @@ inline void BroadcastAddFivefold(
   uint8* output_data_ptr = output_data;
   const uint8* input1_data_ptr = input1_data;
   const uint8* input2_data_reset = input2_data;
-  for (int i4 = 0; i4 < y4; ++i4) {
+  int y0 = params.broadcast_shape[0];
+  int y1 = params.broadcast_shape[1];
+  int y2 = params.broadcast_shape[2];
+  int y3 = params.broadcast_shape[3];
+  int y4 = params.broadcast_shape[4];
+  for (int i0 = 0; i0 < y0; ++i0) {
     const uint8* input2_data_ptr;
-    for (int i3 = 0; i3 < y3; ++i3) {
+    for (int i1 = 0; i1 < y1; ++i1) {
       input2_data_ptr = input2_data_reset;
       for (int i2 = 0; i2 < y2; ++i2) {
-        for (int i1 = 0; i1 < y1; ++i1) {
-          AddElementwise(
-              y0, left_shift, input1_data_ptr, input1_offset, input1_multiplier,
-              input1_shift, input2_data_ptr, input2_offset, input2_multiplier,
-              input2_shift, output_offset, output_multiplier, output_shift,
-              output_activation_min, output_activation_max, output_data_ptr);
-          input2_data_ptr += y0;
-          output_data_ptr += y0;
+        for (int i3 = 0; i3 < y3; ++i3) {
+          AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
+                         output_data_ptr);
+          input2_data_ptr += y4;
+          output_data_ptr += y4;
         }
-        input1_data_ptr += y0;
+        input1_data_ptr += y4;
       }
     }
     input2_data_reset = input2_data_ptr;
   }
 }
 
-template <FusedActivationFunctionType Ac>
-inline void BroadcastAdd(int left_shift, const uint8* input1_data,
-                         const Dims<4>& input1_dims, int32 input1_offset,
-                         int32 input1_multiplier, int input1_shift,
-                         const uint8* input2_data, const Dims<4>& input2_dims,
-                         int32 input2_offset, int32 input2_multiplier,
-                         int input2_shift, int32 output_offset,
-                         int32 output_multiplier, int output_shift,
-                         int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
-  BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset,
-               input1_multiplier, input1_shift, input2_data, input2_dims,
-               input2_offset, input2_multiplier, input2_shift, output_offset,
-               output_multiplier, output_shift, output_activation_min,
-               output_activation_max, output_data, output_dims);
-}
-
-template <FusedActivationFunctionType Ac>
-inline void BroadcastAddFivefold(
-    int y0, int y1, int y2, int y3, int y4, int left_shift,
-    const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
-    int32 input1_multiplier, int input1_shift, const uint8* input2_data,
-    const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
-    int input2_shift, int32 output_offset, int32 output_multiplier,
-    int output_shift, int32 output_activation_min, int32 output_activation_max,
-    uint8* output_data, const Dims<4>& output_dims) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
-  BroadcastAddFivefold(y0, y1, y2, y3, y4, left_shift, input1_data, input1_dims,
-                       input1_offset, input1_multiplier, input1_shift,
-                       input2_data, input2_dims, input2_offset,
-                       input2_multiplier, input2_shift, output_offset,
-                       output_multiplier, output_shift, output_activation_min,
-                       output_activation_max, output_data, output_dims);
-}
-
 inline void Mul(const float* input1_data, const Dims<4>& input1_dims,
                 const float* input2_data, const Dims<4>& input2_dims,
                 float output_activation_min, float output_activation_max,
@@ -3305,135 +2993,78 @@ void BroadcastDiv(const T* input1_data, const Dims<4>& input1_dims,
 }
 
 // TODO(aselle): This is not actually optimized yet.
-inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
-                const float* input2_data, const Dims<4>& input2_dims,
-                float output_activation_min, float output_activation_max,
-                float* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Sub");
-  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+inline void SubNonBroadcast(const ArithmeticParams& params,
+                            const RuntimeShape& input1_shape,
+                            const float* input1_data,
+                            const RuntimeShape& input2_shape,
+                            const float* input2_data,
+                            const RuntimeShape& output_shape,
+                            float* output_data) {
+  gemmlowp::ScopedProfilingLabel label("SubNonBroadcast");
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
   for (int i = 0; i < flat_size; ++i) {
     output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] - input2_data[i], output_activation_min,
-        output_activation_max);
+        input1_data[i] - input2_data[i], params.float_activation_min,
+        params.float_activation_max);
   }
 }
 
-inline void Sub(const int32* input1_data, const Dims<4>& input1_dims,
-                const int32* input2_data, const Dims<4>& input2_dims,
-                int32 output_activation_min, int32 output_activation_max,
-                int32* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Sub/int32");
-  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+inline void SubWithActivation(const ArithmeticParams& params,
+                              const RuntimeShape& input1_shape,
+                              const int32* input1_data,
+                              const RuntimeShape& input2_shape,
+                              const int32* input2_data,
+                              const RuntimeShape& output_shape,
+                              int32* output_data) {
+  gemmlowp::ScopedProfilingLabel label("SubWithActivation/int32");
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, input2_shape);
   for (int i = 0; i < flat_size; ++i) {
     output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] - input2_data[i], output_activation_min,
-        output_activation_max);
+        input1_data[i] - input2_data[i], params.quantized_activation_min,
+        params.quantized_activation_max);
   }
 }
 
-// TODO(jiawen): We can implement BroadcastSub on buffers of arbitrary
-// dimensionality if the runtime code does a single loop over one dimension
-// that handles broadcasting as the base case. The code generator would then
-// generate max(D1, D2) nested for loops.
-// TODO(benoitjacob): BroadcastSub is intentionally duplicated from
-// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
-// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
-// reference_ops.h.
-template <typename T>
-void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T output_activation_min, T output_activation_max,
-                  T* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastSub");
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              ActivationFunctionWithMinMax(
-                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
-                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
-                  output_activation_min, output_activation_max);
-        }
-      }
-    }
+inline void SubWithActivation(const ArithmeticParams& params,
+                              const RuntimeShape& input1_shape,
+                              const float* input1_data,
+                              const RuntimeShape& input2_shape,
+                              const float* input2_data,
+                              const RuntimeShape& output_shape,
+                              float* output_data) {
+  gemmlowp::ScopedProfilingLabel label("SubWithActivation/float");
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, input2_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] - input2_data[i], params.float_activation_min,
+        params.float_activation_max);
   }
 }
 
-inline void BroadcastSub(int left_shift, const uint8* input1_data,
-                         const Dims<4>& input1_dims, int32 input1_offset,
-                         int32 input1_multiplier, int input1_shift,
-                         const uint8* input2_data, const Dims<4>& input2_dims,
-                         int32 input2_offset, int32 input2_multiplier,
-                         int input2_shift, int32 output_offset,
-                         int32 output_multiplier, int output_shift,
-                         int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastSub/8bit");
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+template <typename T>
+void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
+         const T* input1_data, const RuntimeShape& input2_shape,
+         const T* input2_data, const RuntimeShape& output_shape,
+         T* output_data) {
+  gemmlowp::ScopedProfilingLabel label("Sub");
 
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          const int32 input1_val =
-              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
-          const int32 input2_val =
-              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
-          const int32 shifted_input1_val = input1_val * (1 << left_shift);
-          const int32 shifted_input2_val = input2_val * (1 << left_shift);
-          const int32 scaled_input1_val =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input1_val, input1_multiplier,
-                  kReverseShift * input1_shift);
-          const int32 scaled_input2_val =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input2_val, input2_multiplier,
-                  kReverseShift * input2_shift);
-          const int32 raw_sub = scaled_input1_val - scaled_input2_val;
-          const int32 raw_output =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  raw_sub, output_multiplier, kReverseShift * output_shift) +
-              output_offset;
-          const int32 clamped_output =
-              std::min(output_activation_max,
-                       std::max(output_activation_min, raw_output));
-          output_data[Offset(output_dims, c, x, y, b)] =
-              static_cast<uint8>(clamped_output);
-        }
-      }
-    }
+  auto input1_map = MapAsVector(input1_data, input1_shape);
+  auto input2_map = MapAsVector(input2_data, input2_shape);
+  auto output_map = MapAsVector(output_data, output_shape);
+  if (input1_shape == input2_shape) {
+    output_map.array() = input1_map.array() - input2_map.array();
+  } else if (input1_shape.FlatSize() == 1) {
+    auto scalar = input1_data[0];
+    output_map.array() = scalar - input2_map.array();
+  } else if (input2_shape.FlatSize() == 1) {
+    auto scalar = input2_data[0];
+    output_map.array() = input1_map.array() - scalar;
+  } else {
+    BroadcastSub4DSlow(params, input1_shape, input1_data, input2_shape,
+                       input2_data, output_shape, output_data);
   }
 }
 
@@ -5875,63 +5506,6 @@ inline void Slice(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
-template <typename T>
-void GenericBroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
-                         const T* input2_data, const Dims<4>& input2_dims,
-                         T* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("GenericBroadcastSub");
-
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
-              input2_data[SubscriptToIndex(desc2, c, x, y, b)];
-        }
-      }
-    }
-  }
-}
-
-template <typename T>
-void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,
-         const Dims<4>& input2_dims, T* output_data,
-         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("Sub");
-
-  auto input1_map = MapAsVector(input1_data, input1_dims);
-  auto input2_map = MapAsVector(input2_data, input2_dims);
-  auto output_map = MapAsVector(output_data, output_dims);
-  if (AreSameDims(input1_dims, input2_dims)) {
-    output_map.array() = input1_map.array() - input2_map.array();
-  } else if (FlatSize(input1_dims) == 1) {
-    auto scalar = input1_data[0];
-    output_map.array() = scalar - input2_map.array();
-  } else if (FlatSize(input2_dims) == 1) {
-    auto scalar = input2_data[0];
-    output_map.array() = input1_map.array() - scalar;
-  } else {
-    GenericBroadcastSub(input1_data, input1_dims, input2_data, input2_dims,
-                        output_data, output_dims);
-  }
-}
-
 template <typename T>
 void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
                        const T* input2_data, T* output_data,
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
index db7926df9af955df8f9d2a87898b35e1ea2c962e..010b40b901e2821c36367da7e2c987fac113de11 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/tensor_utils_impl.h
@@ -19,6 +19,10 @@ limitations under the License.
 // structure.
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 
+#if defined(_MSC_VER)
+#define __restrict__ __restrict
+#endif
+
 #ifndef USE_NEON
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
 #define USE_NEON
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
index f715d34bc1aab4fe483dc712018b5016ffef25f5..bcf5e4e4f6593ec9bce7acd1fb7082955276ca32 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/legacy_reference_ops.h
@@ -63,6 +63,240 @@ inline void Relu6(const float* input_data, const Dims<4>& input_dims,
         DimsToShape(output_dims));
 }
 
+template <FusedActivationFunctionType Ac>
+inline void Add(int left_shift, const uint8* input1_data,
+                const Dims<4>& input1_dims, int32 input1_offset,
+                int32 input1_multiplier, int input1_shift,
+                const uint8* input2_data, const Dims<4>& input2_dims,
+                int32 input2_offset, int32 input2_multiplier, int input2_shift,
+                int32 output_offset, int32 output_multiplier, int output_shift,
+                int32 output_activation_min, int32 output_activation_max,
+                uint8* output_data, const Dims<4>& output_dims) {
+  constexpr int kReverseShift = -1;
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+
+  tflite::ArithmeticParams op_params;
+  op_params.left_shift = left_shift;
+  op_params.input1_offset = input1_offset;
+  op_params.input1_multiplier = input1_multiplier;
+  op_params.input1_shift = kReverseShift * input1_shift;
+  op_params.input2_offset = input2_offset;
+  op_params.input2_multiplier = input2_multiplier;
+  op_params.input2_shift = kReverseShift * input2_shift;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+  Add(op_params, DimsToShape(input1_dims), input1_data,
+      DimsToShape(input2_dims), input2_data, DimsToShape(output_dims),
+      output_data);
+}
+
+template <FusedActivationFunctionType Ac>
+void Add(const int32* input1_data, const Dims<4>& input1_dims,
+         const int32* input2_data, const Dims<4>& input2_dims,
+         int32* output_data, const Dims<4>& output_dims) {
+  gemmlowp::ScopedProfilingLabel label("Add/int32");
+  TFLITE_DCHECK(Ac == FusedActivationFunctionType::kNone);
+
+  tflite::ArithmeticParams op_params;
+  op_params.quantized_activation_min = std::numeric_limits<int32>::min();
+  op_params.quantized_activation_max = std::numeric_limits<int32>::max();
+  Add(op_params, DimsToShape(input1_dims), input1_data,
+      DimsToShape(input2_dims), input2_data, DimsToShape(output_dims),
+      output_data);
+}
+
+template <FusedActivationFunctionType Ac>
+inline void BroadcastAdd(int left_shift, const uint8* input1_data,
+                         const Dims<4>& input1_dims, int32 input1_offset,
+                         int32 input1_multiplier, int input1_shift,
+                         const uint8* input2_data, const Dims<4>& input2_dims,
+                         int32 input2_offset, int32 input2_multiplier,
+                         int input2_shift, int32 output_offset,
+                         int32 output_multiplier, int output_shift,
+                         int32 output_activation_min,
+                         int32 output_activation_max, uint8* output_data,
+                         const Dims<4>& output_dims) {
+  constexpr int kReverseShift = -1;
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+
+  tflite::ArithmeticParams op_params;
+  op_params.left_shift = left_shift;
+  op_params.input1_offset = input1_offset;
+  op_params.input1_multiplier = input1_multiplier;
+  op_params.input1_shift = kReverseShift * input1_shift;
+  op_params.input2_offset = input2_offset;
+  op_params.input2_multiplier = input2_multiplier;
+  op_params.input2_shift = kReverseShift * input2_shift;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+  BroadcastAdd4DSlow(op_params, DimsToShape(input1_dims), input1_data,
+                     DimsToShape(input2_dims), input2_data,
+                     DimsToShape(output_dims), output_data);
+}
+
+template <FusedActivationFunctionType Ac>
+void Add(const float* input1_data, const Dims<4>& input1_dims,
+         const float* input2_data, const Dims<4>& input2_dims,
+         float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  tflite::ArithmeticParams op_params;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+  Add(op_params, DimsToShape(input1_dims), input1_data,
+      DimsToShape(input2_dims), input2_data, DimsToShape(output_dims),
+      output_data);
+}
+
+template <typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T output_activation_min, T output_activation_max,
+                  T* output_data, const Dims<4>& output_dims) {
+  tflite::ArithmeticParams op_params;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+  BroadcastAdd4DSlow(op_params, DimsToShape(input1_dims), input1_data,
+                     DimsToShape(input2_dims), input2_data,
+                     DimsToShape(output_dims), output_data);
+}
+
+template <FusedActivationFunctionType Ac>
+inline void BroadcastAddFivefold(
+    int y0, int y1, int y2, int y3, int y4, int left_shift,
+    const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
+    int32 input1_multiplier, int input1_shift, const uint8* input2_data,
+    const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
+    int input2_shift, int32 output_offset, int32 output_multiplier,
+    int output_shift, int32 output_activation_min, int32 output_activation_max,
+    uint8* output_data, const Dims<4>& output_dims) {
+  constexpr int kReverseShift = -1;
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, 0);
+    TFLITE_DCHECK_EQ(output_activation_max, 255);
+  }
+  tflite::ArithmeticParams op_params;
+  op_params.broadcast_category =
+      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+  op_params.left_shift = left_shift;
+  op_params.input1_offset = input1_offset;
+  op_params.input1_multiplier = input1_multiplier;
+  op_params.input1_shift = kReverseShift * input1_shift;
+  op_params.input2_offset = input2_offset;
+  op_params.input2_multiplier = input2_multiplier;
+  op_params.input2_shift = kReverseShift * input2_shift;
+  op_params.output_offset = output_offset;
+  op_params.output_multiplier = output_multiplier;
+  op_params.output_shift = kReverseShift * output_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+  op_params.broadcast_shape[4] = y0;
+  op_params.broadcast_shape[3] = y1;
+  op_params.broadcast_shape[2] = y2;
+  op_params.broadcast_shape[1] = y3;
+  op_params.broadcast_shape[0] = y4;
+  BroadcastAddFivefold(op_params, DimsToShape(input1_dims), input1_data,
+                       DimsToShape(input2_dims), input2_data,
+                       DimsToShape(output_dims), output_data);
+}
+
+// legacy, for compatibility with old checked-in code
+template <FusedActivationFunctionType Ac, typename T>
+void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
+                  const T* input2_data, const Dims<4>& input2_dims,
+                  T* output_data, const Dims<4>& output_dims) {
+  T output_activation_min, output_activation_max;
+  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+
+  BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
+               output_activation_min, output_activation_max, output_data,
+               output_dims);
+}
+
+template <FusedActivationFunctionType Ac>
+inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
+                int input1_shift, const int16* input2_data,
+                const Dims<4>& input2_dims, int input2_shift,
+                int16 output_activation_min, int16 output_activation_max,
+                int16* output_data, const Dims<4>& output_dims) {
+  static_assert(Ac == FusedActivationFunctionType::kNone ||
+                    Ac == FusedActivationFunctionType::kRelu ||
+                    Ac == FusedActivationFunctionType::kRelu6 ||
+                    Ac == FusedActivationFunctionType::kRelu1,
+                "");
+  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+  if (Ac == FusedActivationFunctionType::kNone) {
+    TFLITE_DCHECK_EQ(output_activation_min, -32768);
+    TFLITE_DCHECK_EQ(output_activation_max, 32767);
+  }
+
+  tflite::ArithmeticParams op_params;
+  op_params.input1_shift = kReverseShift * input1_shift;
+  op_params.input2_shift = kReverseShift * input2_shift;
+  op_params.quantized_activation_min = output_activation_min;
+  op_params.quantized_activation_max = output_activation_max;
+  Add(op_params, DimsToShape(input1_dims), input1_data,
+      DimsToShape(input2_dims), input2_data, DimsToShape(output_dims),
+      output_data);
+}
+
+inline void Sub(const float* input1_data, const Dims<4>& input1_dims,
+                const float* input2_data, const Dims<4>& input2_dims,
+                float* output_data, const Dims<4>& output_dims) {
+  float output_activation_min, output_activation_max;
+  GetActivationMinMax(FusedActivationFunctionType::kNone,
+                      &output_activation_min, &output_activation_max);
+  tflite::ArithmeticParams op_params;
+  op_params.float_activation_min = output_activation_min;
+  op_params.float_activation_max = output_activation_max;
+  Sub(op_params, DimsToShape(input1_dims), input1_data,
+      DimsToShape(input2_dims), input2_data, DimsToShape(output_dims),
+      output_data);
+}
+
+template <typename T>
+void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,
+         const Dims<4>& input2_dims, T* output_data,
+         const Dims<4>& output_dims) {
+  tflite::ArithmeticParams op_params;
+  op_params.quantized_activation_min = std::numeric_limits<T>::min();
+  op_params.quantized_activation_max = std::numeric_limits<T>::max();
+  Sub(op_params, DimsToShape(input1_dims), input1_data,
+      DimsToShape(input2_dims), input2_data, DimsToShape(output_dims),
+      output_data);
+}
+
 inline void AveragePool(const float* input_data, const Dims<4>& input_dims,
                         int stride_width, int stride_height, int pad_width,
                         int pad_height, int kwidth, int kheight,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
index 7ead449ca84ec2e5bbff9c2be6c6b1c6fe3b018c..6bd88b5596bc0f7c425745012b7b4a091b64afbb 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include <stdlib.h>
 #include <string.h>
+#include <algorithm>
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/kernels/activation_functor.h"
@@ -38,14 +39,14 @@ bool PortableIsZeroVector(const float* vector, int v_size) {
 
 void PortableSymmetricQuantizeFloats(const float* values, const int size,
                                      int8_t* quantized_values,
-                                     float* __restrict__ min,
-                                     float* __restrict__ max,
+                                     float* __restrict__ min_value,
+                                     float* __restrict__ max_value,
                                      float* __restrict__ scaling_factor) {
   auto minmax = std::minmax_element(values, values + size);
-  *min = *minmax.first;
-  *max = *minmax.second;
+  *min_value = *minmax.first;
+  *max_value = *minmax.second;
   const int kScale = 127;
-  const float range = std::max(std::abs(*min), std::abs(*max));
+  const float range = std::max(std::abs(*min_value), std::abs(*max_value));
   if (range == 0) {
     memset(quantized_values, 0, size * sizeof(int8_t));
     *scaling_factor = 1;
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
index d3a4fa8507c19b2ced513a115148dc9f3fda3714..a375aaffa67ac19975cc8e371be11547d689dc72 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/portable_tensor_utils.h
@@ -19,6 +19,10 @@ limitations under the License.
 // structure.
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 
+#if defined(_MSC_VER)
+#define __restrict__ __restrict
+#endif
+
 namespace tflite {
 namespace tensor_utils {
 
@@ -28,8 +32,8 @@ float PortableClip(float f, float abs_limit);
 bool PortableIsZeroVector(const float* vector, int v_size);
 
 void PortableSymmetricQuantizeFloats(const float* values, const int size,
-                                     int8_t* quantized_values, float* min,
-                                     float* max, float* scaling_factor);
+                                     int8_t* quantized_values, float* min_value,
+                                     float* max_value, float* scaling_factor);
 
 // Multiply a matrix by a batch vector, and store results in a batch-size
 // vector.
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 04f61c7434dcbad8d051e3addbcc849d59bdc32d..ce16394082afe6b7474a643469a71934db7dac46 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -158,98 +158,6 @@ SaturatingRoundingMultiplyByPOTParam(
       SaturatingRoundingMultiplyByPOTParam(a.raw(), exponent));
 }
 
-// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING ELEMENT-WISE
-// BROADCASTING.
-//
-// NdArrayDesc<N> describes the shape and memory layout of an N-dimensional
-// rectangular array of numbers.
-//
-// NdArrayDesc<N> is basically identical to Dims<N> defined in types.h.
-// However, as Dims<N> is to be deprecated, this class exists as an adaptor
-// to enable simple unoptimized implementations of element-wise broadcasting
-// operations.
-template <int N>
-struct NdArrayDesc {
-  // The "extent" of each dimension. Indices along dimension d must be in the
-  // half-open interval [0, extents[d]).
-  int extents[N];
-
-  // The number of *elements* (not bytes) between consecutive indices of each
-  // dimension.
-  int strides[N];
-};
-
-// DO NOT USE THIS FUNCTION FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
-// ELEMENT-WISE BROADCASTING.
-//
-// Same as Offset(), except takes as NdArrayDesc<N> instead of Dims<N>.
-inline int SubscriptToIndex(const NdArrayDesc<4>& desc, int i0, int i1, int i2,
-                            int i3) {
-  TFLITE_DCHECK(i0 >= 0 && i0 < desc.extents[0]);
-  TFLITE_DCHECK(i1 >= 0 && i1 < desc.extents[1]);
-  TFLITE_DCHECK(i2 >= 0 && i2 < desc.extents[2]);
-  TFLITE_DCHECK(i3 >= 0 && i3 < desc.extents[3]);
-  return i0 * desc.strides[0] + i1 * desc.strides[1] + i2 * desc.strides[2] +
-         i3 * desc.strides[3];
-}
-
-// Given the dimensions of the operands for an element-wise binary broadcast,
-// adjusts them so that they can be directly iterated over with simple loops.
-// Returns the adjusted dims as instances of NdArrayDesc in 'desc0_out' and
-// 'desc1_out'. 'desc0_out' and 'desc1_out' cannot be nullptr.
-//
-// This function assumes that the two input shapes are compatible up to
-// broadcasting and the shorter one has already been prepended with 1s to be the
-// same length. E.g., if shape0 is (1, 16, 16, 64) and shape1 is (1, 64),
-// shape1 must already have been prepended to be (1, 1, 1, 64). Recall that
-// Dims<N> refer to shapes in reverse order. In this case, input0_dims will be
-// (64, 16, 16, 1) and input1_dims will be (64, 1, 1, 1).
-//
-// When two shapes are compatible up to broadcasting, for each dimension d,
-// the input extents are either equal, or one of them is 1.
-//
-// This function performs the following for each dimension d:
-// - If the extents are equal, then do nothing since the loop that walks over
-//   both of the input arrays is correct.
-// - Otherwise, one (and only one) of the extents must be 1. Say extent0 is 1
-//   and extent1 is e1. Then set extent0 to e1 and stride0 *to 0*. This allows
-//   array0 to be referenced *at any index* in dimension d and still access the
-//   same slice.
-template <int N>
-inline void NdArrayDescsForElementwiseBroadcast(const Dims<N>& input0_dims,
-                                                const Dims<N>& input1_dims,
-                                                NdArrayDesc<N>* desc0_out,
-                                                NdArrayDesc<N>* desc1_out) {
-  TFLITE_DCHECK(desc0_out != nullptr);
-  TFLITE_DCHECK(desc1_out != nullptr);
-
-  // Copy dims to desc.
-  for (int i = 0; i < N; ++i) {
-    desc0_out->extents[i] = input0_dims.sizes[i];
-    desc0_out->strides[i] = input0_dims.strides[i];
-    desc1_out->extents[i] = input1_dims.sizes[i];
-    desc1_out->strides[i] = input1_dims.strides[i];
-  }
-
-  // Walk over each dimension. If the extents are equal do nothing.
-  // Otherwise, set the desc with extent 1 to have extent equal to the other and
-  // stride 0.
-  for (int i = 0; i < N; ++i) {
-    const int extent0 = ArraySize(input0_dims, i);
-    const int extent1 = ArraySize(input1_dims, i);
-    if (extent0 != extent1) {
-      if (extent0 == 1) {
-        desc0_out->strides[i] = 0;
-        desc0_out->extents[i] = extent1;
-      } else {
-        TFLITE_DCHECK_EQ(extent1, 1);
-        desc1_out->strides[i] = 0;
-        desc1_out->extents[i] = extent0;
-      }
-    }
-  }
-}
-
 inline void Conv(const float* input_data, const Dims<4>& input_dims,
                  const float* filter_data, const Dims<4>& filter_dims,
                  const float* bias_data, const Dims<4>& bias_dims,
@@ -1065,114 +973,108 @@ inline void L2Normalization(const uint8* input_data,
 }
 
 template <typename T>
-inline void Add(const T* input1_data, const Dims<4>& input1_dims,
-                const T* input2_data, const Dims<4>& input2_dims,
-                T output_activation_min, T output_activation_max,
-                T* output_data, const Dims<4>& output_dims) {
-  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const T* input1_data,
+                const RuntimeShape& input2_shape, const T* input2_data,
+                const RuntimeShape& output_shape, T* output_data) {
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
   for (int i = 0; i < flat_size; ++i) {
     output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] + input2_data[i], output_activation_min,
-        output_activation_max);
+        input1_data[i] + input2_data[i], params.quantized_activation_min,
+        params.quantized_activation_max);
   }
 }
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac>
-void Add(const float* input1_data, const Dims<4>& input1_dims,
-         const float* input2_data, const Dims<4>& input2_dims,
-         float* output_data, const Dims<4>& output_dims) {
-  float output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
-
-  Add(input1_data, input1_dims, input2_data, input2_dims, output_activation_min,
-      output_activation_max, output_data, output_dims);
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const float* input1_data,
+                const RuntimeShape& input2_shape, const float* input2_data,
+                const RuntimeShape& output_shape, float* output_data) {
+  const int size = MatchingFlatSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < size; i++) {
+    auto x = input1_data[i] + input2_data[i];
+    output_data[i] = ActivationFunctionWithMinMax(
+        x, params.float_activation_min, params.float_activation_max);
+  }
 }
 
-template <FusedActivationFunctionType Ac>
-inline void Add(int left_shift, const uint8* input1_data,
-                const Dims<4>& input1_dims, int32 input1_offset,
-                int32 input1_multiplier, int input1_shift,
-                const uint8* input2_data, const Dims<4>& input2_dims,
-                int32 input2_offset, int32 input2_multiplier, int input2_shift,
-                int32 output_offset, int32 output_multiplier, int output_shift,
-                int32 output_activation_min, int32 output_activation_max,
-                uint8* output_data, const Dims<4>& output_dims) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
-  const int batches =
-      MatchingArraySize(input1_dims, 3, input2_dims, 3, output_dims, 3);
-  const int height =
-      MatchingArraySize(input1_dims, 2, input2_dims, 2, output_dims, 2);
-  const int width =
-      MatchingArraySize(input1_dims, 1, input2_dims, 1, output_dims, 1);
-  const int depth =
-      MatchingArraySize(input1_dims, 0, input2_dims, 0, output_dims, 0);
-  for (int b = 0; b < batches; ++b) {
-    for (int y = 0; y < height; ++y) {
-      for (int x = 0; x < width; ++x) {
-        for (int c = 0; c < depth; ++c) {
-          const int32 input1_val =
-              input1_offset + input1_data[Offset(input1_dims, c, x, y, b)];
-          const int32 input2_val =
-              input2_offset + input2_data[Offset(input2_dims, c, x, y, b)];
-          const int32 shifted_input1_val = input1_val * (1 << left_shift);
-          const int32 shifted_input2_val = input2_val * (1 << left_shift);
-          const int32 scaled_input1_val =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input1_val, input1_multiplier,
-                  kReverseShift * input1_shift);
-          const int32 scaled_input2_val =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input2_val, input2_multiplier,
-                  kReverseShift * input2_shift);
-          const int32 raw_sum = scaled_input1_val + scaled_input2_val;
-          const int32 raw_output =
-              MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  raw_sum, output_multiplier, kReverseShift * output_shift) +
-              output_offset;
-          const int32 clamped_output =
-              std::min(output_activation_max,
-                       std::max(output_activation_min, raw_output));
-          output_data[Offset(output_dims, c, x, y, b)] =
-              static_cast<uint8>(clamped_output);
-        }
-      }
-    }
+// Element-wise add that can often be used for inner loop of broadcast add as
+// well as the non-broadcast add.
+inline void AddElementwise(int size, const ArithmeticParams& params,
+                           const uint8* input1_data, const uint8* input2_data,
+                           uint8* output_data) {
+  TFLITE_DCHECK_GT(params.input1_offset, -256);
+  TFLITE_DCHECK_GT(params.input2_offset, -256);
+  TFLITE_DCHECK_LT(params.input1_offset, 256);
+  TFLITE_DCHECK_LT(params.input2_offset, 256);
+
+  for (int i = 0; i < size; ++i) {
+    const int32 input1_val = params.input1_offset + input1_data[i];
+    const int32 input2_val = params.input2_offset + input2_data[i];
+    const int32 shifted_input1_val = input1_val * (1 << params.left_shift);
+    const int32 shifted_input2_val = input2_val * (1 << params.left_shift);
+    const int32 scaled_input1_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input1_val, params.input1_multiplier, params.input1_shift);
+    const int32 scaled_input2_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input2_val, params.input2_multiplier, params.input2_shift);
+    const int32 raw_sum = scaled_input1_val + scaled_input2_val;
+    const int32 raw_output =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            raw_sum, params.output_multiplier, params.output_shift) +
+        params.output_offset;
+    const int32 clamped_output =
+        std::min(params.quantized_activation_max,
+                 std::max(params.quantized_activation_min, raw_output));
+    output_data[i] = static_cast<uint8>(clamped_output);
   }
 }
 
-inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
-                int input1_shift, const int16* input2_data,
-                const Dims<4>& input2_dims, int input2_shift,
-                int16 output_activation_min, int16 output_activation_max,
-                int16* output_data, const Dims<4>& output_dims) {
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const uint8* input1_data,
+                const RuntimeShape& input2_shape, const uint8* input2_data,
+                const RuntimeShape& output_shape, uint8* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
 
-  const int flat_size = MatchingFlatSize(output_dims, input1_dims, input2_dims);
+  TFLITE_DCHECK_GT(params.input1_offset, -256);
+  TFLITE_DCHECK_GT(params.input2_offset, -256);
+  TFLITE_DCHECK_LT(params.input1_offset, 256);
+  TFLITE_DCHECK_LT(params.input2_offset, 256);
+  AddElementwise(flat_size, params, input1_data, input2_data, output_data);
+}
 
-  TFLITE_DCHECK(input1_shift == 0 || input2_shift == 0);
-  TFLITE_DCHECK_GE(input1_shift, 0);
-  TFLITE_DCHECK_GE(input2_shift, 0);
+inline void Add(const ArithmeticParams& params,
+                const RuntimeShape& input1_shape, const int16* input1_data,
+                const RuntimeShape& input2_shape, const int16* input2_data,
+                const RuntimeShape& output_shape, int16* output_data) {
+  TFLITE_DCHECK_LE(params.quantized_activation_min,
+                   params.quantized_activation_max);
+
+  const int input1_shift = params.input1_shift;
+  const int flat_size =
+      MatchingFlatSize(output_shape, input1_shape, input2_shape);
+  const int16 output_activation_min = params.quantized_activation_min;
+  const int16 output_activation_max = params.quantized_activation_max;
+
+  TFLITE_DCHECK(input1_shift == 0 || params.input2_shift == 0);
+  TFLITE_DCHECK_LE(input1_shift, 0);
+  TFLITE_DCHECK_LE(params.input2_shift, 0);
   const int16* not_shift_input = input1_shift == 0 ? input1_data : input2_data;
   const int16* shift_input = input1_shift == 0 ? input2_data : input1_data;
-  const int input_shift = input1_shift == 0 ? input2_shift : input1_shift;
+  const int input_right_shift =
+      input1_shift == 0 ? -params.input2_shift : -input1_shift;
 
   for (int i = 0; i < flat_size; i++) {
     // F0 uses 0 integer bits, range [-1, 1].
     using F0 = gemmlowp::FixedPoint<std::int16_t, 0>;
 
     F0 input_ready_scaled = F0::FromRaw(not_shift_input[i]);
-    F0 scaled_input =
-        F0::FromRaw(gemmlowp::RoundingDivideByPOT(shift_input[i], input_shift));
+    F0 scaled_input = F0::FromRaw(
+        gemmlowp::RoundingDivideByPOT(shift_input[i], input_right_shift));
     F0 result = gemmlowp::SaturatingAdd(scaled_input, input_ready_scaled);
     const int16 raw_output = result.raw();
     const int16 clamped_output = std::min(
@@ -1181,42 +1083,28 @@ inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-template <FusedActivationFunctionType Ac>
-inline void Add(const int16* input1_data, const Dims<4>& input1_dims,
-                int input1_shift, const int16* input2_data,
-                const Dims<4>& input2_dims, int input2_shift,
-                int16 output_activation_min, int16 output_activation_max,
-                int16* output_data, const Dims<4>& output_dims) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, -32768);
-    TFLITE_DCHECK_EQ(output_activation_max, 32767);
-  }
-
-  Add(input1_data, input1_dims, input1_shift, input2_data, input2_dims,
-      input2_shift, output_activation_min, output_activation_max, output_data,
-      output_dims);
-}
-
 // TODO(jiawen): We can implement BroadcastAdd on buffers of arbitrary
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
 // generate max(D1, D2) nested for loops.
-template <typename T>
-void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T output_activation_min, T output_activation_max,
-                  T* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastAdd");
-
+// TODO(benoitjacob): BroadcastAdd is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const float* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const float* input2_data,
+                               const RuntimeShape& output_shape,
+                               float* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/float");
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
@@ -1229,49 +1117,77 @@ void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
               ActivationFunctionWithMinMax(
-                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] +
-                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
-                  output_activation_min, output_activation_max);
+                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
+                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
+                  params.float_activation_min, params.float_activation_max);
         }
       }
     }
   }
 }
 
-// legacy, for compatibility with old checked-in code
-template <FusedActivationFunctionType Ac, typename T>
-void BroadcastAdd(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T* output_data, const Dims<4>& output_dims) {
-  T output_activation_min, output_activation_max;
-  GetActivationMinMax(Ac, &output_activation_min, &output_activation_max);
+inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const int32* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const int32* input2_data,
+                               const RuntimeShape& output_shape,
+                               int32* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/int32");
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
 
-  BroadcastAdd(input1_data, input1_dims, input2_data, input2_dims,
-               output_activation_min, output_activation_max, output_data,
-               output_dims);
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] +
+                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
+                  params.quantized_activation_min,
+                  params.quantized_activation_max);
+        }
+      }
+    }
+  }
 }
 
-inline void BroadcastAdd(int left_shift, const uint8* input1_data,
-                         const Dims<4>& input1_dims, int32 input1_offset,
-                         int32 input1_multiplier, int input1_shift,
-                         const uint8* input2_data, const Dims<4>& input2_dims,
-                         int32 input2_offset, int32 input2_multiplier,
-                         int input2_shift, int32 output_offset,
-                         int32 output_multiplier, int output_shift,
-                         int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastAdd/8bit");
-
+inline void BroadcastAdd4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const uint8* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const uint8* input2_data,
+                               const RuntimeShape& output_shape,
+                               uint8* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/uint8");
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
@@ -1284,33 +1200,37 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
           const int32 input1_val =
-              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+              params.input1_offset +
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
           const int32 input2_val =
-              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
-          const int32 shifted_input1_val = input1_val * (1 << left_shift);
-          const int32 shifted_input2_val = input2_val * (1 << left_shift);
+              params.input2_offset +
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+          const int32 shifted_input1_val =
+              input1_val * (1 << params.left_shift);
+          const int32 shifted_input2_val =
+              input2_val * (1 << params.left_shift);
           const int32 scaled_input1_val =
               MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input1_val, input1_multiplier,
-                  kReverseShift * input1_shift);
+                  shifted_input1_val, params.input1_multiplier,
+                  params.input1_shift);
           const int32 scaled_input2_val =
               MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input2_val, input2_multiplier,
-                  kReverseShift * input2_shift);
+                  shifted_input2_val, params.input2_multiplier,
+                  params.input2_shift);
           const int32 raw_sum = scaled_input1_val + scaled_input2_val;
           const int32 raw_output =
               MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  raw_sum, output_multiplier, kReverseShift * output_shift) +
-              output_offset;
+                  raw_sum, params.output_multiplier, params.output_shift) +
+              params.output_offset;
           const int32 clamped_output =
-              std::min(output_activation_max,
-                       std::max(output_activation_min, raw_output));
-          output_data[Offset(output_dims, c, x, y, b)] =
+              std::min(params.quantized_activation_max,
+                       std::max(params.quantized_activation_min, raw_output));
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
               static_cast<uint8>(clamped_output);
         }
       }
@@ -1318,117 +1238,62 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data,
   }
 }
 
-inline void BroadcastAddFivefold(
-    int y0, int y1, int y2, int y3, int y4, int left_shift,
-    const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
-    int32 input1_multiplier, int input1_shift, const uint8* input2_data,
-    const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
-    int input2_shift, int32 output_offset, int32 output_multiplier,
-    int output_shift, int32 output_activation_min, int32 output_activation_max,
-    uint8* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastAddFivefold/8bit");
-
-  int sb1 = y0;
-  int sa2 = y0;
-  int sb2 = y0 * y1;
-  int sa3 = y0 * y2;
-  int sa4 = y0 * y2 * y3;
-  int sb4 = y0 * y1 * y2;
-
+inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params,
+                                 const RuntimeShape& unswitched_input1_shape,
+                                 const uint8* unswitched_input1_data,
+                                 const RuntimeShape& unswitched_input2_shape,
+                                 const uint8* unswitched_input2_data,
+                                 const RuntimeShape& output_shape,
+                                 uint8* output_data) {
+  ArithmeticParams switched_params = unswitched_params;
+  switched_params.input1_offset = unswitched_params.input2_offset;
+  switched_params.input1_multiplier = unswitched_params.input2_multiplier;
+  switched_params.input1_shift = unswitched_params.input2_shift;
+  switched_params.input2_offset = unswitched_params.input1_offset;
+  switched_params.input2_multiplier = unswitched_params.input1_multiplier;
+  switched_params.input2_shift = unswitched_params.input1_shift;
+
+  const bool use_unswitched =
+      unswitched_params.broadcast_category ==
+      tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+  const ArithmeticParams& params =
+      use_unswitched ? unswitched_params : switched_params;
+  const uint8* input1_data =
+      use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+  const uint8* input2_data =
+      use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
+  // Fivefold nested loops. The second input resets its position for each
+  // iteration of the second loop. The first input resets its position at the
+  // beginning of the fourth loop. The innermost loop is an elementwise add of
+  // sections of the arrays.
   uint8* output_data_ptr = output_data;
-  for (int i4 = 0; i4 < y4; ++i4) {
-    for (int i3 = 0; i3 < y3; ++i3) {
+  const uint8* input1_data_ptr = input1_data;
+  const uint8* input2_data_reset = input2_data;
+  int y0 = params.broadcast_shape[0];
+  int y1 = params.broadcast_shape[1];
+  int y2 = params.broadcast_shape[2];
+  int y3 = params.broadcast_shape[3];
+  int y4 = params.broadcast_shape[4];
+  for (int i0 = 0; i0 < y0; ++i0) {
+    const uint8* input2_data_ptr;
+    for (int i1 = 0; i1 < y1; ++i1) {
+      input2_data_ptr = input2_data_reset;
       for (int i2 = 0; i2 < y2; ++i2) {
-        for (int i1 = 0; i1 < y1; ++i1) {
-          for (int i0 = 0; i0 < y0; ++i0) {
-            const int32 input1_val =
-                input1_offset +
-                input1_data[i4 * sa4 + i3 * sa3 + i2 * sa2 + i0];
-            const int32 input2_val =
-                input2_offset +
-                input2_data[i4 * sb4 + i2 * sb2 + i1 * sb1 + i0];
-            const int32 shifted_input1_val = input1_val * (1 << left_shift);
-            const int32 shifted_input2_val = input2_val * (1 << left_shift);
-            const int32 scaled_input1_val =
-                MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                    shifted_input1_val, input1_multiplier,
-                    kReverseShift * input1_shift);
-            const int32 scaled_input2_val =
-                MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                    shifted_input2_val, input2_multiplier,
-                    kReverseShift * input2_shift);
-            const int32 raw_sum = scaled_input1_val + scaled_input2_val;
-            const int32 raw_output =
-                MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                    raw_sum, output_multiplier, kReverseShift * output_shift) +
-                output_offset;
-            const int32 clamped_output =
-                std::min(output_activation_max,
-                         std::max(output_activation_min, raw_output));
-            *output_data_ptr = static_cast<uint8>(clamped_output);
-            ++output_data_ptr;
-          }
+        for (int i3 = 0; i3 < y3; ++i3) {
+          AddElementwise(y4, params, input1_data_ptr, input2_data_ptr,
+                         output_data_ptr);
+          input2_data_ptr += y4;
+          output_data_ptr += y4;
         }
+        input1_data_ptr += y4;
       }
     }
+    input2_data_reset = input2_data_ptr;
   }
 }
 
-template <FusedActivationFunctionType Ac>
-inline void BroadcastAdd(int left_shift, const uint8* input1_data,
-                         const Dims<4>& input1_dims, int32 input1_offset,
-                         int32 input1_multiplier, int input1_shift,
-                         const uint8* input2_data, const Dims<4>& input2_dims,
-                         int32 input2_offset, int32 input2_multiplier,
-                         int input2_shift, int32 output_offset,
-                         int32 output_multiplier, int output_shift,
-                         int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
-  BroadcastAdd(left_shift, input1_data, input1_dims, input1_offset,
-               input1_multiplier, input1_shift, input2_data, input2_dims,
-               input2_offset, input2_multiplier, input2_shift, output_offset,
-               output_multiplier, output_shift, output_activation_min,
-               output_activation_max, output_data, output_dims);
-}
-
-template <FusedActivationFunctionType Ac>
-inline void BroadcastAddFivefold(
-    int y0, int y1, int y2, int y3, int y4, int left_shift,
-    const uint8* input1_data, const Dims<4>& input1_dims, int32 input1_offset,
-    int32 input1_multiplier, int input1_shift, const uint8* input2_data,
-    const Dims<4>& input2_dims, int32 input2_offset, int32 input2_multiplier,
-    int input2_shift, int32 output_offset, int32 output_multiplier,
-    int output_shift, int32 output_activation_min, int32 output_activation_max,
-    uint8* output_data, const Dims<4>& output_dims) {
-  static_assert(Ac == FusedActivationFunctionType::kNone ||
-                    Ac == FusedActivationFunctionType::kRelu ||
-                    Ac == FusedActivationFunctionType::kRelu6 ||
-                    Ac == FusedActivationFunctionType::kRelu1,
-                "");
-  TFLITE_DCHECK_LE(output_activation_min, output_activation_max);
-  if (Ac == FusedActivationFunctionType::kNone) {
-    TFLITE_DCHECK_EQ(output_activation_min, 0);
-    TFLITE_DCHECK_EQ(output_activation_max, 255);
-  }
-  BroadcastAddFivefold(y0, y1, y2, y3, y4, left_shift, input1_data, input1_dims,
-                       input1_offset, input1_multiplier, input1_shift,
-                       input2_data, input2_dims, input2_offset,
-                       input2_multiplier, input2_shift, output_offset,
-                       output_multiplier, output_shift, output_activation_min,
-                       output_activation_max, output_data, output_dims);
-}
-
 template <typename T>
 inline void Mul(const T* input1_data, const Dims<4>& input1_dims,
                 const T* input2_data, const Dims<4>& input2_dims,
@@ -1667,16 +1532,35 @@ inline void Div(const T* input1_data, const Dims<4>& input1_dims,
   }
 }
 
-template <typename T>
-inline void Sub(const T* input1_data, const Dims<4>& input1_dims,
-                const T* input2_data, const Dims<4>& input2_dims,
-                T output_activation_min, T output_activation_max,
-                T* output_data, const Dims<4>& output_dims) {
-  const int flat_size = MatchingFlatSize(input1_dims, input2_dims, output_dims);
+inline void SubNonBroadcast(const ArithmeticParams& params,
+                            const RuntimeShape& input1_shape,
+                            const float* input1_data,
+                            const RuntimeShape& input2_shape,
+                            const float* input2_data,
+                            const RuntimeShape& output_shape,
+                            float* output_data) {
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
   for (int i = 0; i < flat_size; ++i) {
     output_data[i] = ActivationFunctionWithMinMax(
-        input1_data[i] - input2_data[i], output_activation_min,
-        output_activation_max);
+        input1_data[i] - input2_data[i], params.float_activation_min,
+        params.float_activation_max);
+  }
+}
+
+inline void SubNonBroadcast(const ArithmeticParams& params,
+                            const RuntimeShape& input1_shape,
+                            const int32* input1_data,
+                            const RuntimeShape& input2_shape,
+                            const int32* input2_data,
+                            const RuntimeShape& output_shape,
+                            int32* output_data) {
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] - input2_data[i], params.quantized_activation_min,
+        params.quantized_activation_max);
   }
 }
 
@@ -1684,16 +1568,24 @@ inline void Sub(const T* input1_data, const Dims<4>& input1_dims,
 // dimensionality if the runtime code does a single loop over one dimension
 // that handles broadcasting as the base case. The code generator would then
 // generate max(D1, D2) nested for loops.
-template <typename T>
-void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
-                  const T* input2_data, const Dims<4>& input2_dims,
-                  T output_activation_min, T output_activation_max,
-                  T* output_data, const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastSub");
-
+// TODO(benoitjacob): BroadcastSub is intentionally duplicated from
+// reference_ops.h. Once an optimized version is implemented and NdArrayDesc<T>
+// is no longer referenced in this file, move NdArrayDesc<T> from types.h to
+// reference_ops.h.
+inline void BroadcastSub4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const float* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const float* input2_data,
+                               const RuntimeShape& output_shape,
+                               float* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/float");
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
@@ -1706,36 +1598,35 @@ void BroadcastSub(const T* input1_data, const Dims<4>& input1_dims,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
               ActivationFunctionWithMinMax(
-                  input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
-                      input2_data[SubscriptToIndex(desc2, c, x, y, b)],
-                  output_activation_min, output_activation_max);
+                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
+                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
+                  params.float_activation_min, params.float_activation_max);
         }
       }
     }
   }
 }
 
-inline void BroadcastSub(int left_shift, const uint8* input1_data,
-                         const Dims<4>& input1_dims, int32 input1_offset,
-                         int32 input1_multiplier, int input1_shift,
-                         const uint8* input2_data, const Dims<4>& input2_dims,
-                         int32 input2_offset, int32 input2_multiplier,
-                         int input2_shift, int32 output_offset,
-                         int32 output_multiplier, int output_shift,
-                         int32 output_activation_min,
-                         int32 output_activation_max, uint8* output_data,
-                         const Dims<4>& output_dims) {
-  gemmlowp::ScopedProfilingLabel label("BroadcastSub/8bit");
-
+inline void BroadcastSub4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const uint8* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const uint8* input2_data,
+                               const RuntimeShape& output_shape,
+                               uint8* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/uint8");
   NdArrayDesc<4> desc1;
   NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
 
   // In Tensorflow, the dimensions are canonically named (batch_number, row,
   // col, channel), with extents (batches, height, width, depth), with the
@@ -1748,33 +1639,37 @@ inline void BroadcastSub(int left_shift, const uint8* input1_data,
   // We name our variables by their Tensorflow convention, but generate C code
   // nesting loops such that the innermost loop has the smallest stride for the
   // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
           const int32 input1_val =
-              input1_offset + input1_data[SubscriptToIndex(desc1, c, x, y, b)];
+              params.input1_offset +
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)];
           const int32 input2_val =
-              input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
-          const int32 shifted_input1_val = input1_val * (1 << left_shift);
-          const int32 shifted_input2_val = input2_val * (1 << left_shift);
+              params.input2_offset +
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+          const int32 shifted_input1_val =
+              input1_val * (1 << params.left_shift);
+          const int32 shifted_input2_val =
+              input2_val * (1 << params.left_shift);
           const int32 scaled_input1_val =
               MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input1_val, input1_multiplier,
-                  kReverseShift * input1_shift);
+                  shifted_input1_val, params.input1_multiplier,
+                  params.input1_shift);
           const int32 scaled_input2_val =
               MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  shifted_input2_val, input2_multiplier,
-                  kReverseShift * input2_shift);
+                  shifted_input2_val, params.input2_multiplier,
+                  params.input2_shift);
           const int32 raw_sub = scaled_input1_val - scaled_input2_val;
           const int32 raw_output =
               MultiplyByQuantizedMultiplierSmallerThanOneExp(
-                  raw_sub, output_multiplier, kReverseShift * output_shift) +
-              output_offset;
+                  raw_sub, params.output_multiplier, params.output_shift) +
+              params.output_offset;
           const int32 clamped_output =
-              std::min(output_activation_max,
-                       std::max(output_activation_min, raw_output));
-          output_data[Offset(output_dims, c, x, y, b)] =
+              std::min(params.quantized_activation_max,
+                       std::max(params.quantized_activation_min, raw_output));
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
               static_cast<uint8>(clamped_output);
         }
       }
@@ -1782,6 +1677,156 @@ inline void BroadcastSub(int left_shift, const uint8* input1_data,
   }
 }
 
+inline void BroadcastSub4DSlow(const ArithmeticParams& params,
+                               const RuntimeShape& input1_shape,
+                               const int32* input1_data,
+                               const RuntimeShape& input2_shape,
+                               const int32* input2_data,
+                               const RuntimeShape& output_shape,
+                               int32* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/int32");
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
+                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
+                  params.quantized_activation_min,
+                  params.quantized_activation_max);
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void BroadcastSub4DSlow(const ArithmeticParams& params,
+                        const RuntimeShape& input1_shape, const T* input1_data,
+                        const RuntimeShape& input2_shape, const T* input2_data,
+                        const RuntimeShape& output_shape, T* output_data) {
+  gemmlowp::ScopedProfilingLabel label("BroadcastAdd4DSlow/templated");
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              ActivationFunctionWithMinMax(
+                  input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
+                      input2_data[SubscriptToIndex(desc2, b, y, x, c)],
+                  params.quantized_activation_min,
+                  params.quantized_activation_max);
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void Sub(const ArithmeticParams& params, const RuntimeShape& input1_shape,
+         const T* input1_data, const RuntimeShape& input2_shape,
+         const T* input2_data, const RuntimeShape& output_shape,
+         T* output_data) {
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1,
+                                      &desc2);
+  RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          output_data[Offset(extended_output_shape, b, y, x, c)] =
+              input1_data[SubscriptToIndex(desc1, b, y, x, c)] -
+              input2_data[SubscriptToIndex(desc2, b, y, x, c)];
+        }
+      }
+    }
+  }
+}
+
+inline void SubWithActivation(const ArithmeticParams& params,
+                              const RuntimeShape& input1_shape,
+                              const int32* input1_data,
+                              const RuntimeShape& input2_shape,
+                              const int32* input2_data,
+                              const RuntimeShape& output_shape,
+                              int32* output_data) {
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, input2_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] - input2_data[i], params.quantized_activation_min,
+        params.quantized_activation_max);
+  }
+}
+
+inline void SubWithActivation(const ArithmeticParams& params,
+                              const RuntimeShape& input1_shape,
+                              const float* input1_data,
+                              const RuntimeShape& input2_shape,
+                              const float* input2_data,
+                              const RuntimeShape& output_shape,
+                              float* output_data) {
+  const int flat_size =
+      MatchingFlatSize(input1_shape, input2_shape, input2_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = ActivationFunctionWithMinMax(
+        input1_data[i] - input2_data[i], params.float_activation_min,
+        params.float_activation_max);
+  }
+}
+
 template <FusedActivationFunctionType Ac, typename Scalar>
 void Concatenation(int concat_dim, const Scalar* const* input_data,
                    const Dims<4>* const* input_dims, int inputs_count,
@@ -1815,6 +1860,26 @@ void Concatenation(int concat_dim, const Scalar* const* input_data,
   }
 }
 
+template <typename Scalar>
+void Pack(int dim, const Scalar* const* input_data,
+          const Dims<4>* const* input_dims, int inputs_count,
+          Scalar* output_data, const Dims<4>& output_dims) {
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+  int outer_size = 1;
+  for (int i = dim + 1; i < 4; i++) {
+    outer_size *= output_dims.sizes[i];
+  }
+  Scalar* output_ptr = output_data;
+  const int copy_size = FlatSize(**input_dims) / outer_size;
+  for (int k = 0; k < outer_size; k++) {
+    for (int i = 0; i < inputs_count; ++i) {
+      memcpy(output_ptr, input_data[i] + k * copy_size,
+             copy_size * sizeof(Scalar));
+      output_ptr += copy_size;
+    }
+  }
+}
+
 // TODO(prabhumk): This is the same as the optimized implementation.
 // TODO(prabhumk): The quantized implementation of concatentation isn't fully
 // quantized as it takes scale as a floating point value. This should be fixed
@@ -3219,7 +3284,8 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
                            const Dims<4>& block_shape_dims,
                            const int32* paddings_data,
                            const Dims<4>& paddings_dims, T* output_data,
-                           const Dims<4>& output_dims) {
+                           const Dims<4>& output_dims,
+                           const int32_t pad_value) {
   const int output_batch_size = ArraySize(output_dims, 3);
   const int output_height = ArraySize(output_dims, 2);
   const int output_width = ArraySize(output_dims, 1);
@@ -3244,7 +3310,7 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
                 padding_top + input_height ||
             out_w * block_shape_width + shift_w < padding_left ||
             out_w * block_shape_width + shift_w >= padding_left + input_width) {
-          memset(out, 0, depth * sizeof(T));
+          memset(out, pad_value, depth * sizeof(T));
         } else {
           const T* in =
               input_data +
@@ -3259,6 +3325,17 @@ inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
+                           const int32* block_shape_data,
+                           const Dims<4>& block_shape_dims,
+                           const int32* paddings_data,
+                           const Dims<4>& paddings_dims, T* output_data,
+                           const Dims<4>& output_dims) {
+  SpaceToBatchND(input_data, input_dims, block_shape_data, block_shape_dims,
+                 paddings_data, paddings_dims, output_data, output_dims, 0);
+}
+
 template <typename T>
 inline void BatchToSpaceND(const T* input_data, const Dims<4>& input_dims,
                            const int32* block_shape_data,
@@ -3472,7 +3549,6 @@ inline bool Reduce(const In* input_data, const int* input_dims,
                    Out reducer(const Out current, const In in),
                    Out* output_data) {
   // Reset input iterator.
-  TFLITE_DCHECK(input_num_dims > 0);
   for (int idx = 0; idx < input_num_dims; ++idx) {
     input_iter[idx] = 0;
   }
@@ -3492,6 +3568,10 @@ inline bool ResolveAxis(const int num_dims, const int* axis,
                         const int64_t num_axis, int* out_axis,
                         int* out_num_axis) {
   *out_num_axis = 0;  // Just in case.
+  // Short-circuit axis resolution for scalars; the axis will go unused.
+  if (num_dims == 0) {
+    return true;
+  }
   // o(n^2) is fine since out_num_axis should be really small, mostly <= 4
   for (int64_t idx = 0; idx < num_axis; ++idx) {
     // Handle negative index.
@@ -3717,38 +3797,6 @@ inline void Mean(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
-template <typename T>
-void Sub(const T* input1_data, const Dims<4>& input1_dims, const T* input2_data,
-         const Dims<4>& input2_dims, T* output_data,
-         const Dims<4>& output_dims) {
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_dims, input2_dims, &desc1, &desc2);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-  for (int b = 0; b < ArraySize(output_dims, 3); ++b) {
-    for (int y = 0; y < ArraySize(output_dims, 2); ++y) {
-      for (int x = 0; x < ArraySize(output_dims, 1); ++x) {
-        for (int c = 0; c < ArraySize(output_dims, 0); ++c) {
-          output_data[Offset(output_dims, c, x, y, b)] =
-              input1_data[SubscriptToIndex(desc1, c, x, y, b)] -
-              input2_data[SubscriptToIndex(desc2, c, x, y, b)];
-        }
-      }
-    }
-  }
-}
-
 template <typename T>
 void TensorFlowMinimum(const T* input1_data, const Dims<4>& input1_dims,
                        const T* input2_data, T* output_data,
diff --git a/tensorflow/contrib/lite/kernels/internal/spectrogram.cc b/tensorflow/contrib/lite/kernels/internal/spectrogram.cc
index 4eddf7bf0a2cbca695dae20ba8ba56a9cd72e4ba..20abcb725859d03f83c969369bddf1429895e0ba 100644
--- a/tensorflow/contrib/lite/kernels/internal/spectrogram.cc
+++ b/tensorflow/contrib/lite/kernels/internal/spectrogram.cc
@@ -43,13 +43,13 @@ bool Spectrogram::Initialize(int window_length, int step_length) {
   return Initialize(window, step_length);
 }
 
-inline int Log2Floor(uint n) {
+inline int Log2Floor(uint32_t n) {
   if (n == 0) return -1;
   int log = 0;
-  uint value = n;
+  uint32_t value = n;
   for (int i = 4; i >= 0; --i) {
     int shift = (1 << i);
-    uint x = value >> shift;
+    uint32_t x = value >> shift;
     if (x != 0) {
       value = x;
       log += shift;
@@ -58,7 +58,7 @@ inline int Log2Floor(uint n) {
   return log;
 }
 
-inline int Log2Ceiling(uint n) {
+inline int Log2Ceiling(uint32_t n) {
   int floor = Log2Floor(n);
   if (n == (n & ~(n - 1)))  // zero or a power of two
     return floor;
@@ -66,7 +66,7 @@ inline int Log2Ceiling(uint n) {
     return floor + 1;
 }
 
-inline uint NextPowerOfTwo(uint value) {
+inline uint32_t NextPowerOfTwo(uint32_t value) {
   int exponent = Log2Ceiling(value);
   // DCHECK_LT(exponent, std::numeric_limits<uint32>::digits);
   return 1 << exponent;
diff --git a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
index 82f450312784a1864dc7732dad2a75d2d6ae90f4..1ff8cfe39c9aed7e9241815dc8eff7ab4d9fd585 100644
--- a/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/tensor_utils.h
@@ -17,6 +17,10 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/builtin_op_data.h"
 
+#if defined(_MSC_VER)
+#define __restrict__ __restrict
+#endif
+
 namespace tflite {
 namespace tensor_utils {
 
@@ -31,8 +35,8 @@ bool IsZeroVector(const float* vector, int v_size);
 // It also outputs the range (min, max) of the floating point buffer, and the
 // scaling factor used to quantize the values.
 void SymmetricQuantizeFloats(const float* values, const int size,
-                             int8_t* quantized_values, float* min, float* max,
-                             float* scaling_factor);
+                             int8_t* quantized_values, float* min_value,
+                             float* max_value, float* scaling_factor);
 
 // Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch
 // dimension composed by input vectors independent from each other). The result
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index 737cfb69c9e2ac48e87a6aa77e7f51a2098f8c41..c44698b677a862bc41c947ea46fe204710b79668 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -119,6 +119,8 @@ class RuntimeShape {
   // larger shapes are separately allocated.
   static constexpr int kMaxSmallSize = 4;
 
+  RuntimeShape& operator=(RuntimeShape const&) = delete;
+
   RuntimeShape() : size_(0) {}
 
   explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {
@@ -135,6 +137,20 @@ class RuntimeShape {
     BuildFrom(init_list);
   }
 
+  // Avoid using this constructor.  We should be able to delete it when C++17
+  // rolls out.
+  RuntimeShape(RuntimeShape const& other) : size_(other.DimensionsCount()) {
+    if (size_ > kMaxSmallSize) {
+      dims_pointer_ = new int32[size_];
+    }
+    std::memcpy(DimsData(), other.DimsData(), sizeof(int32) * size_);
+  }
+
+  bool operator==(const RuntimeShape& comp) const {
+    return this->size_ == comp.size_ &&
+           std::memcmp(DimsData(), comp.DimsData(), size_ * sizeof(int32)) == 0;
+  }
+
   ~RuntimeShape() {
     if (size_ > kMaxSmallSize) {
       delete[] dims_pointer_;
@@ -191,6 +207,16 @@ class RuntimeShape {
     }
   }
 
+  // This will probably be factored out. Old code made substantial use of 4-D
+  // shapes, and so this function is used to extend smaller shapes. Note that
+  // (a) as Dims<4>-dependent code is eliminated, the reliance on this should be
+  // reduced, and (b) some kernels are stricly 4-D, but then the shapes of their
+  // inputs should already be 4-D, so this function should not be needed.
+  inline static RuntimeShape ExtendedShape(int new_shape_size,
+                                           const RuntimeShape& shape) {
+    return RuntimeShape(new_shape_size, shape, 1);
+  }
+
   inline void BuildFrom(const std::initializer_list<int> init_list) {
     BuildFrom<const std::initializer_list<int>>(init_list);
   }
@@ -208,7 +234,25 @@ class RuntimeShape {
     return buffer_size;
   }
 
+  bool operator!=(const RuntimeShape& comp) const { return !((*this) == comp); }
+
  private:
+  // For use only by ExtendFrom(), written to guarantee (return-value) copy
+  // elision in C++17.
+  // This creates a shape padded to the desired size with the specified value.
+  RuntimeShape(int new_shape_size, const RuntimeShape& shape, int pad_value)
+      : size_(0) {
+    TFLITE_CHECK_GE(new_shape_size, shape.DimensionsCount());
+    TFLITE_CHECK_LE(new_shape_size, kMaxSmallSize);
+    Resize(new_shape_size);
+    const int size_increase = new_shape_size - shape.DimensionsCount();
+    for (int i = 0; i < size_increase; ++i) {
+      SetDim(i, pad_value);
+    }
+    std::memcpy(DimsData() + size_increase, shape.DimsData(),
+                sizeof(int32) * shape.DimensionsCount());
+  }
+
   int32 size_;
   union {
     int32 dims_[kMaxSmallSize];
@@ -234,7 +278,9 @@ inline tflite::Dims<4> ToRuntimeDims(const tflite::RuntimeShape& array_shape) {
 
 // Gets next index to iterate through a multidimensional array.
 inline bool NextIndex(const int num_dims, const int* dims, int* current) {
-  TFLITE_DCHECK_GT(num_dims, 0);
+  if (num_dims == 0) {
+    return false;
+  }
   TFLITE_DCHECK(dims != nullptr);
   TFLITE_DCHECK(current != nullptr);
   int carry = 1;
@@ -261,7 +307,9 @@ inline bool NextIndex(const int num_dims, const int* dims, int* current) {
 inline size_t ReducedOutputOffset(const int num_dims, const int* dims,
                                   const int* index, const int num_axis,
                                   const int* axis) {
-  TFLITE_DCHECK_GT(num_dims, 0);
+  if (num_dims == 0) {
+    return 0;
+  }
   TFLITE_DCHECK(dims != nullptr);
   TFLITE_DCHECK(index != nullptr);
   size_t offset = 0;
@@ -364,6 +412,7 @@ inline int RequiredBufferSizeForDims(const Dims<4>& dims) {
 // arrays.
 inline int MatchingFlatSize(const RuntimeShape& shape,
                             const RuntimeShape& check_shape_0) {
+  TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount());
   const int dims_count = shape.DimensionsCount();
   for (int i = 0; i < dims_count; ++i) {
     TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
@@ -374,6 +423,7 @@ inline int MatchingFlatSize(const RuntimeShape& shape,
 inline int MatchingFlatSize(const RuntimeShape& shape,
                             const RuntimeShape& check_shape_0,
                             const RuntimeShape& check_shape_1) {
+  TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount());
   const int dims_count = shape.DimensionsCount();
   for (int i = 0; i < dims_count; ++i) {
     TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
@@ -385,6 +435,7 @@ inline int MatchingFlatSize(const RuntimeShape& shape,
                             const RuntimeShape& check_shape_0,
                             const RuntimeShape& check_shape_1,
                             const RuntimeShape& check_shape_2) {
+  TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount());
   const int dims_count = shape.DimensionsCount();
   for (int i = 0; i < dims_count; ++i) {
     TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
@@ -397,6 +448,7 @@ inline int MatchingFlatSize(const RuntimeShape& shape,
                             const RuntimeShape& check_shape_1,
                             const RuntimeShape& check_shape_2,
                             const RuntimeShape& check_shape_3) {
+  TFLITE_DCHECK_EQ(shape.DimensionsCount(), check_shape_0.DimensionsCount());
   const int dims_count = shape.DimensionsCount();
   for (int i = 0; i < dims_count; ++i) {
     TFLITE_DCHECK_EQ(shape.Dims(i), check_shape_0.Dims(i));
@@ -601,14 +653,74 @@ struct PoolParams {
   int stride_width;
   int filter_height;
   int filter_width;
-  // uint8, etc, inference params.
+  // uint8, etc, activation params.
   int32 quantized_activation_min;
   int32 quantized_activation_max;
-  // float inference params.
+  // float activation params.
   float float_activation_min;
   float float_activation_max;
 };
 
+enum class BroadcastableOpCategory : uint8 {
+  kNone,
+  kNonBroadcast,               // Matching input shapes.
+  kFirstInputBroadcastsFast,   // Fivefold nested loops.
+  kSecondInputBroadcastsFast,  // Fivefold nested loops.
+  kGenericBroadcast,           // Fall-back.
+};
+
+// For Add, Sub, Mul ops.
+struct ArithmeticParams {
+  // Shape dependent / common to data / op types.
+  BroadcastableOpCategory broadcast_category;
+  // uint8 inference params.
+  int32 input1_offset;
+  int32 input2_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int output_shift;
+  // Add / Sub, not Mul, uint8 inference params.
+  int left_shift;
+  int32 input1_multiplier;
+  int input1_shift;
+  int32 input2_multiplier;
+  int input2_shift;
+  // uint8, etc, activation params.
+  int32 quantized_activation_min;
+  int32 quantized_activation_max;
+  // float activation params.
+  float float_activation_min;
+  float float_activation_max;
+
+  // Processed output dimensions.
+  // Let input "a" be the one that broadcasts in the faster-changing dimension.
+  // Then, after coalescing, for shapes {a0, a1, a2, a3, a4} and
+  // {b0, b1, b2, b3, b4},
+  // broadcast_shape[4] = b0 = a0.
+  // broadcast_shape[3] = b1; a1 = 1.
+  // broadcast_shape[2] = b2 = a2.
+  // broadcast_shape[1] = a3; b3 = 1.
+  // broadcast_shape[0] = b4 = a4.
+  int broadcast_shape[5];
+};
+
+template <typename T>
+inline void SetActivationParams(T min, T max, ArithmeticParams* params);
+
+template <>
+inline void SetActivationParams(float min, float max,
+                                ArithmeticParams* params) {
+  params->float_activation_min = min;
+  params->float_activation_max = max;
+}
+
+template <>
+inline void SetActivationParams(int32 min, int32 max,
+                                ArithmeticParams* params) {
+  params->quantized_activation_min = min;
+  params->quantized_activation_max = max;
+}
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
diff --git a/tensorflow/contrib/lite/kernels/lsh_projection.cc b/tensorflow/contrib/lite/kernels/lsh_projection.cc
index 25d2dc2cdd699b4d9c8e83eb848fce0df3c59c15..69523b02cce0547fe87873e924deabb50cbeb4e5 100644
--- a/tensorflow/contrib/lite/kernels/lsh_projection.cc
+++ b/tensorflow/contrib/lite/kernels/lsh_projection.cc
@@ -50,7 +50,6 @@ limitations under the License.
 //     Output.Dim == { Tensor[0].Dim[0] * Tensor[0].Dim[1] }
 //     A flattened tensor represents projected bit vectors.
 
-#include <unistd.h>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/lstm.cc b/tensorflow/contrib/lite/kernels/lstm.cc
index 4dfc8915489f9dcf243b13bc10afcef278779a93..ba251c451e549a09d265fc43fed7dc7eb6896d61 100644
--- a/tensorflow/contrib/lite/kernels/lstm.cc
+++ b/tensorflow/contrib/lite/kernels/lstm.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <unistd.h>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
@@ -97,7 +96,7 @@ constexpr int kCellStateTensor = 1;
 constexpr int kOutputTensor = 2;
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* op_data = new OpData;
+  auto* op_data = new OpData();
   op_data->kernel_type = kTfLiteLSTMFullKernel;
   context->AddTensors(context, /*tensors_to_add=*/7,
                       &op_data->scratch_tensor_index);
@@ -847,7 +846,7 @@ enum OutputTensor {
 };
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* op_data = new OpData;
+  auto* op_data = new OpData();
   op_data->kernel_type = kTfLiteLSTMBasicKernel;
   // `scratch_tensor_index` is unused in this kernel.
   op_data->scratch_tensor_index = -1;
diff --git a/tensorflow/contrib/lite/kernels/one_hot.cc b/tensorflow/contrib/lite/kernels/one_hot.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9ff3dca932d4284321b299cfc79571c43fce7155
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/one_hot.cc
@@ -0,0 +1,199 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace one_hot {
+
+constexpr int kIndicesTensor = 0;
+constexpr int kDepthTensor = 1;
+constexpr int kOnValueTensor = 2;
+constexpr int kOffValueTensor = 3;
+constexpr int kOutputTensor = 0;
+
+// Convenience utility for destructuring a node into the appropriate tensors and
+// data for the op. Note that this destructuring is quite cheap, so we can avoid
+// allocating op-specific, persistent data on the heap.
+struct OneHotContext {
+  OneHotContext(TfLiteContext* context, TfLiteNode* node) {
+    indices = GetInput(context, node, kIndicesTensor);
+    depth = GetInput(context, node, kDepthTensor);
+    on_value = GetInput(context, node, kOnValueTensor);
+    off_value = GetInput(context, node, kOffValueTensor);
+    output = GetOutput(context, node, kOutputTensor);
+
+    const auto* params =
+        reinterpret_cast<TfLiteOneHotParams*>(node->builtin_data);
+    const int indices_dims = indices->dims->size;
+    axis = (params->axis == -1) ? indices_dims : params->axis;
+    output_dims = indices_dims + 1;
+    dtype = on_value->type;
+  }
+
+  const TfLiteTensor* indices;
+  const TfLiteTensor* depth;
+  const TfLiteTensor* on_value;
+  const TfLiteTensor* off_value;
+  TfLiteTensor* output;
+  int axis;
+  int output_dims;
+  TfLiteType dtype;
+};
+
+template <typename T, typename TI>
+void OneHotComputeImpl(const OneHotContext& op_context) {
+  // prefix_dim_size == # of elements before the axis
+  // depth == # of elements per axis
+  // suffix_dim_size == # of elements after the axis
+  int prefix_dim_size = 1;
+  for (int i = 0; i < op_context.axis; ++i) {
+    prefix_dim_size *= op_context.indices->dims->data[i];
+  }
+  const int suffix_dim_size = NumElements(op_context.indices) / prefix_dim_size;
+  const int depth = *op_context.depth->data.i32;
+
+  const T on_value = *GetTensorData<T>(op_context.on_value);
+  const T off_value = *GetTensorData<T>(op_context.off_value);
+
+  // View the indices as a matrix of size:
+  //     prefix_dim_size x suffix_dim_size
+  // View the output as a matrix of size:
+  //     prefix_dim_size x depth x suffix_dim_size
+  // Then the output is:
+  //     output(i, j, k) == (indices(i, k) == j) ? on : off
+  T* output = GetTensorData<T>(op_context.output);
+  const TI* indices = GetTensorData<TI>(op_context.indices);
+  for (int i = 0; i < prefix_dim_size; ++i) {
+    for (int j = 0; j < depth; ++j) {
+      for (int k = 0; k < suffix_dim_size; ++k, ++output) {
+        *output = static_cast<int>(indices[i * suffix_dim_size + k]) == j
+                      ? on_value
+                      : off_value;
+      }
+    }
+  }
+}
+
+template <typename T>
+void OneHotCompute(const OneHotContext& op_context) {
+  if (op_context.indices->type == kTfLiteInt64) {
+    OneHotComputeImpl<T, int64_t>(op_context);
+  } else {
+    OneHotComputeImpl<T, int>(op_context);
+  }
+}
+
+TfLiteStatus ResizeOutputTensor(TfLiteContext* context,
+                                const OneHotContext& op_context) {
+  TF_LITE_ENSURE(context, *op_context.depth->data.i32 >= 0);
+  TfLiteIntArray* output_size = TfLiteIntArrayCreate(op_context.output_dims);
+  for (int i = 0; i < op_context.output_dims; ++i) {
+    if (i < op_context.axis) {
+      output_size->data[i] = op_context.indices->dims->data[i];
+    } else if (i == op_context.axis) {
+      output_size->data[i] = *op_context.depth->data.i32;
+    } else {
+      output_size->data[i] = op_context.indices->dims->data[i - 1];
+    }
+  }
+  return context->ResizeTensor(context, op_context.output, output_size);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 4);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  OneHotContext op_context{context, node};
+  switch (op_context.dtype) {
+    // TODO(b/111744875): Support uint8 and quantization.
+    case kTfLiteFloat32:
+    case kTfLiteInt16:
+    case kTfLiteInt32:
+    case kTfLiteInt64:
+    case kTfLiteBool:
+      op_context.output->type = op_context.dtype;
+      break;
+    default:
+      context->ReportError(context, "Unknown output data type: %d",
+                           op_context.dtype);
+      return kTfLiteError;
+  }
+
+  TF_LITE_ENSURE(context, op_context.indices->type == kTfLiteInt32 ||
+                              op_context.indices->type == kTfLiteInt64);
+  TF_LITE_ENSURE(context, op_context.axis >= 0 &&
+                              op_context.axis < op_context.output_dims);
+  TF_LITE_ENSURE_EQ(context, NumElements(op_context.depth), 1);
+  TF_LITE_ENSURE_EQ(context, NumElements(op_context.on_value), 1);
+  TF_LITE_ENSURE_EQ(context, NumElements(op_context.off_value), 1);
+  TF_LITE_ENSURE_EQ(context, op_context.on_value->type, op_context.dtype);
+  TF_LITE_ENSURE_EQ(context, op_context.off_value->type, op_context.dtype);
+
+  if (!IsConstantTensor(op_context.depth)) {
+    SetTensorToDynamic(op_context.output);
+    return kTfLiteOk;
+  }
+
+  return ResizeOutputTensor(context, op_context);
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  OneHotContext op_context{context, node};
+
+  if (IsDynamicTensor(op_context.output)) {
+    ResizeOutputTensor(context, op_context);
+  }
+
+  switch (op_context.output->type) {
+    case kTfLiteFloat32:
+      OneHotCompute<float>(op_context);
+      break;
+    case kTfLiteInt32:
+      OneHotCompute<int>(op_context);
+      break;
+    case kTfLiteInt64:
+      OneHotCompute<int64_t>(op_context);
+      break;
+    case kTfLiteBool:
+      OneHotCompute<bool>(op_context);
+      break;
+    default:
+      return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace one_hot
+
+TfLiteRegistration* Register_ONE_HOT() {
+  static TfLiteRegistration r = {
+      nullptr,
+      nullptr,
+      one_hot::Prepare,
+      one_hot::Eval,
+  };
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/one_hot_test.cc b/tensorflow/contrib/lite/kernels/one_hot_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6b604ec7a7f86b333805d91a95cb5054f0257ae4
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/one_hot_test.cc
@@ -0,0 +1,182 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <initializer_list>
+
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class OneHotOpModel : public SingleOpModel {
+ public:
+  OneHotOpModel(std::initializer_list<int> input_shape, int depth_value,
+                TensorType dtype, int axis = -1, T on_value = 1,
+                T off_value = 0, TensorType indices_type = TensorType_INT32) {
+    indices_ = AddInput(indices_type);
+    int depth = AddInput(TensorType_INT32);
+    int on = AddInput(dtype);
+    int off = AddInput(dtype);
+    output_ = AddOutput(dtype);
+    SetBuiltinOp(BuiltinOperator_ONE_HOT, BuiltinOptions_OneHotOptions,
+                 CreateOneHotOptions(builder_, axis).Union());
+    BuildInterpreter({input_shape});
+
+    PopulateTensor<int>(depth, {depth_value});
+    PopulateTensor<T>(on, {on_value});
+    PopulateTensor<T>(off, {off_value});
+  }
+
+  template <typename TI>
+  void SetIndices(std::initializer_list<TI> data) {
+    PopulateTensor<TI>(indices_, data);
+  }
+
+  TfLiteStatus InvokeWithResult() { return interpreter_->Invoke(); }
+
+  int32_t GetOutputSize() { return GetTensorSize(output_); }
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int indices_;
+  int output_;
+};
+
+TEST(OneHotOpTest, BasicFloat) {
+  const int depth = 3;
+  OneHotOpModel<float> model({3}, depth, TensorType_FLOAT32);
+  model.SetIndices({0, 1, 2});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 3}));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1.f, 0.f, 0.f, 0.f, 1.f, 0.f, 0.f, 0.f, 1.f}));
+}
+
+TEST(OneHotOpTest, BasicInt) {
+  const int depth = 3;
+  OneHotOpModel<int> model({3}, depth, TensorType_INT32);
+  model.SetIndices({0, 1, 2});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 3}));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0, 0, 0, 1, 0, 0, 0, 1}));
+}
+
+TEST(OneHotOpTest, BasicBool) {
+  const int depth = 3;
+  OneHotOpModel<bool> model({3}, depth, TensorType_BOOL);
+  model.SetIndices({0, 1, 2});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 3}));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({true, false, false, false, true, false, false,
+                                false, true}));
+}
+
+TEST(OneHotOpTest, SmallDepth) {
+  const int depth = 1;
+  OneHotOpModel<int> model({3}, depth, TensorType_INT32);
+  model.SetIndices({0, 1, 2});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 1}));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0, 0}));
+}
+
+TEST(OneHotOpTest, BigDepth) {
+  const int depth = 4;
+  OneHotOpModel<int> model({2}, depth, TensorType_INT32);
+  model.SetIndices({0, 1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 4}));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0, 0, 0, 0, 1, 0, 0}));
+}
+
+TEST(OneHotOpTest, OnOffValues) {
+  const int depth = 3;
+  const int axis = -1;
+  const int on = 5;
+  const int off = 0;
+  OneHotOpModel<int> model({4}, depth, TensorType_INT32, axis, on, off);
+  model.SetIndices({0, 2, -1, 1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({4, 3}));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({5, 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0}));
+}
+
+TEST(OneHotOpTest, ZeroAxis) {
+  const int depth = 3;
+  const int axis = 0;
+  const int on = 5;
+  const int off = 0;
+  OneHotOpModel<int> model({4}, depth, TensorType_INT32, axis, on, off);
+  model.SetIndices({0, 2, -1, 1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 4}));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({5, 0, 0, 0, 0, 0, 0, 5, 0, 5, 0, 0}));
+}
+
+TEST(OneHotOpTest, MultiDimensionalIndices) {
+  const int depth = 3;
+  const int axis = -1;
+  const float on = 2;
+  const float off = 0;
+  OneHotOpModel<float> model({2, 2}, depth, TensorType_FLOAT32, axis, on, off);
+  model.SetIndices({0, 2, 1, -1});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2, 2, 3}));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0}));
+}
+
+TEST(OneHotOpTest, Int64Indices) {
+  const int depth = 3;
+  const int axis = -1;
+  const int on = 1;
+  const int off = 0;
+  OneHotOpModel<int> model({3}, depth, TensorType_INT32, axis, on, off,
+                           TensorType_INT64);
+  std::initializer_list<int64_t> indices = {0, 1, 2};
+  model.SetIndices(indices);
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({3, 3}));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0, 0, 0, 1, 0, 0, 0, 1}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/pack.cc b/tensorflow/contrib/lite/kernels/pack.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bb3416f6a6ca60250f137986e479e8f1085e2558
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/pack.cc
@@ -0,0 +1,131 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace pack {
+namespace {
+
+constexpr int kOutputTensor = 0;
+
+// Op data for pack op.
+struct OpData {
+  int values_count;
+  int axis;
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->axis = 0;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const OpData* data = reinterpret_cast<OpData*>(node->builtin_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), data->values_count);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input0 = GetInput(context, node, 0);
+  TF_LITE_ENSURE(context, NumDimensions(input0) < 4);
+  TF_LITE_ENSURE(context, NumDimensions(input0) >= data->axis);
+  // TODO(renjieliu): Support negative axis.
+  TF_LITE_ENSURE(context, data->axis >= 0);
+  if (input0->type != kTfLiteInt32 && input0->type != kTfLiteFloat32) {
+    context->ReportError(context,
+                         "Currently pack only supports int32 and float32.");
+    return kTfLiteError;
+  }
+  // Make sure all inputs have the same shape and type.
+  for (int i = 1; i < data->values_count; ++i) {
+    const TfLiteTensor* input = GetInput(context, node, i);
+    TF_LITE_ENSURE(context, HaveSameShapes(input0, input));
+    TF_LITE_ENSURE_EQ(context, input0->type, input->type);
+  }
+
+  // Resize output. rank R will become rank R + 1
+  const int dimension_size = NumDimensions(input0) + 1;
+  const TfLiteIntArray* input_shape = input0->dims;
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(dimension_size);
+  int i = 0;
+  for (int index = 0; index < dimension_size; ++index) {
+    if (index == data->axis) {
+      output_shape->data[index] = data->values_count;
+    } else {
+      output_shape->data[index] = input_shape->data[i++];
+    }
+  }
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE_EQ(context, output->type, input0->type);
+
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+template <typename T>
+void PackImpl(TfLiteContext* context, TfLiteNode* node, TfLiteTensor* output,
+              int values_count, int axis) {
+  VectorOfTensors<T> all_inputs(*context, *node->inputs);
+  reference_ops::Pack<T>(RemapDim(NumDimensions(output), axis),
+                         all_inputs.data(), all_inputs.dims(), values_count,
+                         GetTensorData<T>(output), GetTensorDims(output));
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const OpData* data = reinterpret_cast<OpData*>(node->builtin_data);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  switch (output->type) {
+    case kTfLiteFloat32: {
+      PackImpl<float>(context, node, output, data->values_count, data->axis);
+      break;
+    }
+    case kTfLiteInt32: {
+      PackImpl<int32_t>(context, node, output, data->values_count, data->axis);
+      break;
+    }
+    default: {
+      context->ReportError(context,
+                           "Currently pack only supports int32 and float32.");
+      return kTfLiteError;
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace
+}  // namespace pack
+
+TfLiteRegistration* Register_PACK() {
+  static TfLiteRegistration r = {pack::Init, pack::Free, pack::Prepare,
+                                 pack::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/pack_test.cc b/tensorflow/contrib/lite/kernels/pack_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..485a50ad3ac493fd02f619f7d7cbaf10d3a6aff0
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/pack_test.cc
@@ -0,0 +1,120 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class PackOpModel : public SingleOpModel {
+ public:
+  PackOpModel(const TensorData& input_template, int axis, int values_count) {
+    std::vector<std::vector<int>> all_input_shapes;
+    for (int i = 0; i < values_count; ++i) {
+      all_input_shapes.push_back(input_template.shape);
+      AddInput(input_template);
+    }
+    output_ = AddOutput({input_template.type, /*shape=*/{}, input_template.min,
+                         input_template.max});
+    SetBuiltinOp(BuiltinOperator_PACK, BuiltinOptions_PackOptions,
+                 CreatePackOptions(builder_, values_count, axis).Union());
+    BuildInterpreter(all_input_shapes);
+  }
+
+  void SetInput(int index, std::initializer_list<T> data) {
+    PopulateTensor(index, data);
+  }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int output_;
+};
+
+TEST(PackOpTest, FloatThreeInputs) {
+  PackOpModel<float> model({TensorType_FLOAT32, {2}}, 0, 3);
+  model.SetInput(0, {1, 4});
+  model.SetInput(1, {2, 5});
+  model.SetInput(2, {3, 6});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(3, 2));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6}));
+}
+
+TEST(PackOpTest, FloatThreeInputsDifferentAxis) {
+  PackOpModel<float> model({TensorType_FLOAT32, {2}}, 1, 3);
+  model.SetInput(0, {1, 4});
+  model.SetInput(1, {2, 5});
+  model.SetInput(2, {3, 6});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 3));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(PackOpTest, FloatMultilDimensions) {
+  PackOpModel<float> model({TensorType_FLOAT32, {2, 3}}, 1, 2);
+  model.SetInput(0, {1, 2, 3, 4, 5, 6});
+  model.SetInput(1, {7, 8, 9, 10, 11, 12});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2, 3));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
+}
+
+TEST(PackOpTest, IntThreeInputs) {
+  PackOpModel<int32_t> model({TensorType_INT32, {2}}, 0, 3);
+  model.SetInput(0, {1, 4});
+  model.SetInput(1, {2, 5});
+  model.SetInput(2, {3, 6});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(3, 2));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 4, 2, 5, 3, 6}));
+}
+
+TEST(PackOpTest, IntThreeInputsDifferentAxis) {
+  PackOpModel<int32_t> model({TensorType_INT32, {2}}, 1, 3);
+  model.SetInput(0, {1, 4});
+  model.SetInput(1, {2, 5});
+  model.SetInput(2, {3, 6});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 3));
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(PackOpTest, IntMultilDimensions) {
+  PackOpModel<int32_t> model({TensorType_INT32, {2, 3}}, 1, 2);
+  model.SetInput(0, {1, 2, 3, 4, 5, 6});
+  model.SetInput(1, {7, 8, 9, 10, 11, 12});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2, 3));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 2, 3, 7, 8, 9, 4, 5, 6, 10, 11, 12}));
+}
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/pooling.cc b/tensorflow/contrib/lite/kernels/pooling.cc
index 9b0487ae16e6546ac39b5d75ecf9d14ae4e4e4cf..29a5be068368365e67ad0653b775afe1e976f23a 100644
--- a/tensorflow/contrib/lite/kernels/pooling.cc
+++ b/tensorflow/contrib/lite/kernels/pooling.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <unistd.h>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/reduce.cc b/tensorflow/contrib/lite/kernels/reduce.cc
index 52e4084ff8641c65773c12c12f56631c8099cdae..e99f67c7258c555903069dff67a86a3703249c7c 100644
--- a/tensorflow/contrib/lite/kernels/reduce.cc
+++ b/tensorflow/contrib/lite/kernels/reduce.cc
@@ -78,6 +78,10 @@ TfLiteStatus ResizeOutputTensor(TfLiteContext* context, OpContext* op_context) {
   size_t num_axis = NumElements(op_context->axis);
   const TfLiteIntArray* input_dims = op_context->input->dims;
   int input_num_dims = NumDimensions(op_context->input);
+  if (input_num_dims == 0) {
+    return context->ResizeTensor(context, op_context->output,
+                                 TfLiteIntArrayCreate(0));
+  }
   const int* axis = GetTensorData<int>(op_context->axis);
   if (op_context->params->keep_dims) {
     TfLiteIntArray* output_dims = TfLiteIntArrayCreate(input_num_dims);
diff --git a/tensorflow/contrib/lite/kernels/reduce_test.cc b/tensorflow/contrib/lite/kernels/reduce_test.cc
index 7d28931ecdf62ff6fcd474bf8a9790c8f4766bdd..5d432d34ef5118e7164d7f767dad6017aa640e51 100644
--- a/tensorflow/contrib/lite/kernels/reduce_test.cc
+++ b/tensorflow/contrib/lite/kernels/reduce_test.cc
@@ -22,6 +22,7 @@ namespace tflite {
 namespace {
 
 using ::testing::ElementsAreArray;
+using ::testing::IsEmpty;
 
 class BaseOpModel : public SingleOpModel {
  public:
@@ -197,6 +198,16 @@ TEST(ConstFloatMeanOpTest, KeepDims) {
               ElementsAreArray(ArrayFloatNear({10.5, 12.5, 14.5})));
 }
 
+TEST(ConstFloatMeanOpTest, Scalar) {
+  std::vector<float> data = {3.27};
+  MeanOpConstModel m({TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}}, {},
+                     {0}, true);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({3.27})));
+}
+
 TEST(DynamicFloatMeanOpTest, NotKeepDims) {
   std::vector<float> data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
                              9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
@@ -342,6 +353,16 @@ TEST(DynamicFloatSumOpTest, NotKeepDims) {
               ElementsAreArray(ArrayFloatNear({144, 156})));
 }
 
+TEST(ConstFloatSumOpTest, Scalar) {
+  std::vector<float> data = {17.};
+  SumOpConstModel m({TensorType_FLOAT32, {}}, {TensorType_FLOAT32, {}}, {}, {0},
+                    false);
+  m.SetInput(data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({17.})));
+}
+
 TEST(DynamicFloatSumOpTest, KeepDims) {
   std::vector<float> data = {1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,  8.0,
                              9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
@@ -630,6 +651,20 @@ TEST(DynamicUint8MaxOpTest, KeepDims) {
                   ArrayFloatNear({11.1294, 0.862745}, kQuantizedTolerance)));
 }
 
+TEST(DynamicUint8MaxOpTest, Scalar) {
+  float kQuantizedTolerance = GetTolerance(-10.0, 12.0);
+  std::vector<float> data = {11.14};
+  MaxOpDynamicModel m({TensorType_UINT8, {}, -10.0, 12.0},
+                      {TensorType_UINT8, {}, -10.0, 12.0},
+                      {TensorType_INT32, {1}}, true);
+  std::vector<int> axis = {0};
+  m.QuantizeAndPopulate<uint8_t>(m.Input(), data);
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), IsEmpty());
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(ArrayFloatNear({11.1294}, kQuantizedTolerance)));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index f0f2757277a9ed41490726d4e22fe65207928bca..da69b8504112d65c83b04b00a946fe7980761440 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -106,6 +106,8 @@ TfLiteRegistration* Register_RSQRT();
 TfLiteRegistration* Register_SHAPE();
 TfLiteRegistration* Register_POW();
 TfLiteRegistration* Register_FAKE_QUANT();
+TfLiteRegistration* Register_PACK();
+TfLiteRegistration* Register_ONE_HOT();
 
 BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
@@ -195,6 +197,8 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_SHAPE, Register_SHAPE());
   AddBuiltin(BuiltinOperator_POW, Register_POW());
   AddBuiltin(BuiltinOperator_FAKE_QUANT, Register_FAKE_QUANT(), 1, 2);
+  AddBuiltin(BuiltinOperator_PACK, Register_PACK());
+  AddBuiltin(BuiltinOperator_ONE_HOT, Register_ONE_HOT());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/contrib/lite/kernels/reshape.cc b/tensorflow/contrib/lite/kernels/reshape.cc
index 3287040695140e3e7921c9f517450b9416b050b6..99ecc16093372fcda3257eef6099af64770c1c27 100644
--- a/tensorflow/contrib/lite/kernels/reshape.cc
+++ b/tensorflow/contrib/lite/kernels/reshape.cc
@@ -25,16 +25,11 @@ namespace builtin {
 namespace reshape {
 
 constexpr int kInputTensor = 0;
+constexpr int kShapeTensor = 1;
 constexpr int kOutputTensor = 0;
 
-TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = reinterpret_cast<TfLiteReshapeParams*>(node->builtin_data);
-
-  // TODO(ahentz): we are often given a tensor with the shape but we only pay
-  // attention to what the shape specified in 'params'.
-  TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
-  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
-
+TfLiteStatus ResizeOutput(TfLiteContext* context, TfLiteNode* node,
+                          TfLiteIntArray* output_shape) {
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
@@ -47,32 +42,76 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
     num_input_elements *= SizeOfDimension(input, i);
   }
 
-  TfLiteIntArray* output_size = TfLiteIntArrayCreate(params->num_dimensions);
   int num_output_elements = 1;
   int stretch_dim = -1;
-  for (int i = 0; i < params->num_dimensions; ++i) {
-    int value = params->shape[i];
+  for (int i = 0; i < output_shape->size; ++i) {
+    int value = output_shape->data[i];
     if (value == -1) {
       TF_LITE_ENSURE_EQ(context, stretch_dim, -1);
       stretch_dim = i;
     } else {
       num_output_elements *= value;
-      output_size->data[i] = value;
     }
   }
   if (stretch_dim != -1) {
-    output_size->data[stretch_dim] = num_input_elements / num_output_elements;
-    num_output_elements *= output_size->data[stretch_dim];
+    output_shape->data[stretch_dim] = num_input_elements / num_output_elements;
+    num_output_elements *= output_shape->data[stretch_dim];
   }
 
   TF_LITE_ENSURE_EQ(context, num_input_elements, num_output_elements);
-  return context->ResizeTensor(context, output, output_size);
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+TfLiteStatus ResizeOutputWithShapeTensor(TfLiteContext* context,
+                                         TfLiteNode* node) {
+  const TfLiteTensor* shape = GetInput(context, node, kShapeTensor);
+
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(shape->dims->data[0]);
+  for (int i = 0; i < output_shape->size; ++i) {
+    output_shape->data[i] = shape->data.i32[i];
+  }
+  return ResizeOutput(context, node, output_shape);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  auto* params = reinterpret_cast<TfLiteReshapeParams*>(node->builtin_data);
+
+  TF_LITE_ENSURE(context, NumInputs(node) == 1 || NumInputs(node) == 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  // Attempt to use shape tensor if it exists.
+  if (NumInputs(node) == 2) {
+    const TfLiteTensor* shape = GetInput(context, node, kShapeTensor);
+    // Check if the shape tensor is valid.
+    if (shape->dims->size == 1 && shape->type == kTfLiteInt32) {
+      // Set the output tensor as dynamic if the shape isn't constnat.
+      if (!IsConstantTensor(shape)) {
+        TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+        SetTensorToDynamic(output);
+        return kTfLiteOk;
+      }
+      // Shape is constant. Resize now.
+      return ResizeOutputWithShapeTensor(context, node);
+    }
+  }
+  // The function is returned above this line if the shape tensor is usable.
+  // Now fallback to the shape parameter in `TfLiteReshapeParams`.
+
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(params->num_dimensions);
+  for (int i = 0; i < params->num_dimensions; ++i) {
+    output_shape->data[i] = params->shape[i];
+  }
+  return ResizeOutput(context, node, output_shape);
 }
 
 TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
   TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
 
+  if (IsDynamicTensor(output)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutputWithShapeTensor(context, node));
+  }
+
   memcpy(output->data.raw, input->data.raw, input->bytes);
 
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
index 10caffea03ebcec7862df1627541ac3d076b04e4..f4289105f7931ae572f219a61b5479287aff926a 100644
--- a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
@@ -247,7 +247,7 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
       3, 6,   //
       9, 12,  //
       4, 10,  //
-      10, 16  //
+      12, 16  //
   });
   m.SetSize({3, 3});
   m.Invoke();
@@ -256,8 +256,8 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
                                         7, 9, 10,    //
                                         9, 11, 12,   //
                                         4, 8, 10,    //
-                                        8, 12, 14,   //
-                                        10, 13, 16,  //
+                                        9, 12, 14,   //
+                                        12, 14, 16,  //
                                     })));
 
   ResizeBilinearOpModel const_m({TensorType_UINT8, {2, 2, 2, 1}}, {3, 3});
@@ -265,7 +265,7 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
       3, 6,   //
       9, 12,  //
       4, 10,  //
-      10, 16  //
+      12, 16  //
   });
   const_m.Invoke();
   EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
@@ -273,35 +273,35 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
                                               7, 9, 10,    //
                                               9, 11, 12,   //
                                               4, 8, 10,    //
-                                              8, 12, 14,   //
-                                              10, 13, 16,  //
+                                              9, 12, 14,   //
+                                              12, 14, 16,  //
                                           })));
 }
 
 TEST(ResizeBilinearOpTest, ThreeDimensionalResize8Bit) {
   ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 2}});
   m.SetInput<uint8>({
-      3, 4, 6, 10,    //
-      9, 10, 12, 16,  //
+      3, 4, 6, 10,     //
+      10, 12, 14, 16,  //
   });
   m.SetSize({3, 3});
   m.Invoke();
   EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
-                                        3, 4, 5, 8, 6, 10,      //
-                                        7, 8, 9, 12, 10, 14,    //
-                                        9, 10, 11, 13, 12, 16,  //
+                                        3, 4, 5, 8, 6, 10,       //
+                                        7, 9, 10, 12, 11, 14,    //
+                                        10, 12, 12, 14, 14, 16,  //
                                     })));
 
   ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 2, 2}}, {3, 3});
   const_m.SetInput<uint8>({
-      3, 4, 6, 10,    //
-      9, 10, 12, 16,  //
+      3, 4, 6, 10,     //
+      10, 12, 14, 16,  //
   });
   const_m.Invoke();
   EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
-                                              3, 4, 5, 8, 6, 10,      //
-                                              7, 8, 9, 12, 10, 14,    //
-                                              9, 10, 11, 13, 12, 16,  //
+                                              3, 4, 5, 8, 6, 10,       //
+                                              7, 9, 10, 12, 11, 14,    //
+                                              10, 12, 12, 14, 14, 16,  //
                                           })));
 }
 }  // namespace
diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
index c9269599e58f095ded4788e2ab064583ae0a708c..03079f1c3b4110da9193f91ed22940594152b10f 100644
--- a/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
+++ b/tensorflow/contrib/lite/kernels/space_to_batch_nd.cc
@@ -113,7 +113,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     TF_LITE_ENSURE_OK(context, ResizeOutputTensor(context, &op_context));
   }
 
-#define TF_LITE_SPACE_TO_BATCH_ND(type, scalar)                        \
+#define TF_LITE_SPACE_TO_BATCH_ND(type, scalar, pad_value)             \
   type::SpaceToBatchND(GetTensorData<scalar>(op_context.input),        \
                        GetTensorDims(op_context.input),                \
                        GetTensorData<int32_t>(op_context.block_shape), \
@@ -121,34 +121,36 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                        GetTensorData<int32_t>(op_context.paddings),    \
                        GetTensorDims(op_context.paddings),             \
                        GetTensorData<scalar>(op_context.output),       \
-                       GetTensorDims(op_context.output))
+                       GetTensorDims(op_context.output), pad_value)
   switch (op_context.input->type) {  // Already know in/out types are same.
     case kTfLiteFloat32:
       if (kernel_type == kReference) {
-        TF_LITE_SPACE_TO_BATCH_ND(reference_ops, float);
+        TF_LITE_SPACE_TO_BATCH_ND(reference_ops, float, 0);
       } else {
-        TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, float);
+        TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, float, 0);
       }
       break;
     case kTfLiteUInt8:
       if (kernel_type == kReference) {
-        TF_LITE_SPACE_TO_BATCH_ND(reference_ops, uint8_t);
+        TF_LITE_SPACE_TO_BATCH_ND(reference_ops, uint8_t,
+                                  op_context.output->params.zero_point);
       } else {
-        TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, uint8_t);
+        TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, uint8_t,
+                                  op_context.output->params.zero_point);
       }
       break;
     case kTfLiteInt32:
       if (kernel_type == kReference) {
-        TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int32_t);
+        TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int32_t, 0);
       } else {
-        TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int32_t);
+        TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int32_t, 0);
       }
       break;
     case kTfLiteInt64:
       if (kernel_type == kReference) {
-        TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int64_t);
+        TF_LITE_SPACE_TO_BATCH_ND(reference_ops, int64_t, 0);
       } else {
-        TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int64_t);
+        TF_LITE_SPACE_TO_BATCH_ND(optimized_ops, int64_t, 0);
       }
       break;
     default:
diff --git a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc b/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc
index 92a4a037d5873e608ee7bdbdfc5eaa5e9b62bc8c..5756573629a51917e39a312117a1fcd29c150dc0 100644
--- a/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc
+++ b/tensorflow/contrib/lite/kernels/space_to_batch_nd_test.cc
@@ -23,6 +23,7 @@ namespace tflite {
 namespace {
 
 using ::testing::ElementsAreArray;
+using ::testing::Matcher;
 
 class SpaceToBatchNDOpModel : public SingleOpModel {
  public:
@@ -30,6 +31,10 @@ class SpaceToBatchNDOpModel : public SingleOpModel {
     PopulateTensor<float>(input_, data);
   }
 
+  void SetQuantizedInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<uint8_t>(input_, data);
+  }
+
   void SetBlockShape(std::initializer_list<int> data) {
     PopulateTensor<int>(block_shape_, data);
   }
@@ -41,6 +46,11 @@ class SpaceToBatchNDOpModel : public SingleOpModel {
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
   std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
 
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
+                               GetScale(output_), GetZeroPoint(output_));
+  }
+
  protected:
   int input_;
   int block_shape_;
@@ -56,18 +66,19 @@ class SpaceToBatchNDOpModel : public SingleOpModel {
 //    m.Invoke();
 class SpaceToBatchNDOpConstModel : public SpaceToBatchNDOpModel {
  public:
-  SpaceToBatchNDOpConstModel(std::initializer_list<int> input_shape,
+  SpaceToBatchNDOpConstModel(const TensorData& input,
                              std::initializer_list<int> block_shape,
-                             std::initializer_list<int> paddings) {
-    input_ = AddInput(TensorType_FLOAT32);
+                             std::initializer_list<int> paddings,
+                             const TensorData& output) {
+    input_ = AddInput(input);
     block_shape_ = AddConstInput(TensorType_INT32, block_shape, {2});
     paddings_ = AddConstInput(TensorType_INT32, paddings, {2, 2});
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(output);
 
     SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND,
                  BuiltinOptions_SpaceToBatchNDOptions,
                  CreateSpaceToBatchNDOptions(builder_).Union());
-    BuildInterpreter({input_shape});
+    BuildInterpreter({input.shape});
   }
 };
 
@@ -81,26 +92,30 @@ class SpaceToBatchNDOpConstModel : public SpaceToBatchNDOpModel {
 //    m.Invoke();
 class SpaceToBatchNDOpDynamicModel : public SpaceToBatchNDOpModel {
  public:
-  SpaceToBatchNDOpDynamicModel(std::initializer_list<int> input_shape) {
-    input_ = AddInput(TensorType_FLOAT32);
+  SpaceToBatchNDOpDynamicModel(const TensorData& input,
+                               const TensorData& output) {
+    input_ = AddInput(input);
     block_shape_ = AddInput(TensorType_INT32);
     paddings_ = AddInput(TensorType_INT32);
-    output_ = AddOutput(TensorType_FLOAT32);
+    output_ = AddOutput(output);
 
     SetBuiltinOp(BuiltinOperator_SPACE_TO_BATCH_ND,
                  BuiltinOptions_SpaceToBatchNDOptions,
                  CreateSpaceToBatchNDOptions(builder_).Union());
-    BuildInterpreter({input_shape, {2}, {2, 2}});
+    BuildInterpreter({input.shape, {2}, {2, 2}});
   }
 };
 
 TEST(SpaceToBatchNDOpTest, InvalidShapeTest) {
-  EXPECT_DEATH(SpaceToBatchNDOpConstModel({1, 3, 3, 1}, {2, 2}, {0, 0, 0, 0}),
-               "Cannot allocate tensors");
+  EXPECT_DEATH(
+      SpaceToBatchNDOpConstModel({TensorType_FLOAT32, {1, 3, 3, 1}}, {2, 2},
+                                 {0, 0, 0, 0}, {TensorType_FLOAT32}),
+      "Cannot allocate tensors");
 }
 
 TEST(SpaceToBatchNDOpTest, SimpleConstTest) {
-  SpaceToBatchNDOpConstModel m({1, 4, 4, 1}, {2, 2}, {0, 0, 0, 0});
+  SpaceToBatchNDOpConstModel m({TensorType_FLOAT32, {1, 4, 4, 1}}, {2, 2},
+                               {0, 0, 0, 0}, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 2, 2, 1}));
@@ -109,7 +124,8 @@ TEST(SpaceToBatchNDOpTest, SimpleConstTest) {
 }
 
 TEST(SpaceToBatchNDOpTest, SimpleDynamicTest) {
-  SpaceToBatchNDOpDynamicModel m({1, 4, 4, 1});
+  SpaceToBatchNDOpDynamicModel m({TensorType_FLOAT32, {1, 4, 4, 1}},
+                                 {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.SetBlockShape({2, 2});
   m.SetPaddings({0, 0, 0, 0});
@@ -120,7 +136,8 @@ TEST(SpaceToBatchNDOpTest, SimpleDynamicTest) {
 }
 
 TEST(SpaceToBatchNDOpTest, MultipleInputBatchesConstTest) {
-  SpaceToBatchNDOpConstModel m({2, 2, 4, 1}, {2, 2}, {0, 0, 0, 0});
+  SpaceToBatchNDOpConstModel m({TensorType_FLOAT32, {2, 2, 4, 1}}, {2, 2},
+                               {0, 0, 0, 0}, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({8, 1, 2, 1}));
@@ -129,7 +146,8 @@ TEST(SpaceToBatchNDOpTest, MultipleInputBatchesConstTest) {
 }
 
 TEST(SpaceToBatchNDOpTest, MultipleInputBatchesDynamicTest) {
-  SpaceToBatchNDOpDynamicModel m({2, 2, 4, 1});
+  SpaceToBatchNDOpDynamicModel m({TensorType_FLOAT32, {2, 2, 4, 1}},
+                                 {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
   m.SetBlockShape({2, 2});
   m.SetPaddings({0, 0, 0, 0});
@@ -140,7 +158,8 @@ TEST(SpaceToBatchNDOpTest, MultipleInputBatchesDynamicTest) {
 }
 
 TEST(SpaceToBatchNDOpTest, SimplePaddingConstTest) {
-  SpaceToBatchNDOpConstModel m({1, 5, 2, 1}, {3, 2}, {1, 0, 2, 0});
+  SpaceToBatchNDOpConstModel m({TensorType_FLOAT32, {1, 5, 2, 1}}, {3, 2},
+                               {1, 0, 2, 0}, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
@@ -151,7 +170,8 @@ TEST(SpaceToBatchNDOpTest, SimplePaddingConstTest) {
 }
 
 TEST(SpaceToBatchNDOpTest, SimplePaddingDynamicTest) {
-  SpaceToBatchNDOpDynamicModel m({1, 5, 2, 1});
+  SpaceToBatchNDOpDynamicModel m({TensorType_FLOAT32, {1, 5, 2, 1}},
+                                 {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
   m.SetBlockShape({3, 2});
   m.SetPaddings({1, 0, 2, 0});
@@ -164,7 +184,8 @@ TEST(SpaceToBatchNDOpTest, SimplePaddingDynamicTest) {
 }
 
 TEST(SpaceToBatchNDOpTest, ComplexPaddingConstTest) {
-  SpaceToBatchNDOpConstModel m({1, 4, 2, 1}, {3, 2}, {1, 1, 2, 4});
+  SpaceToBatchNDOpConstModel m({TensorType_FLOAT32, {1, 4, 2, 1}}, {3, 2},
+                               {1, 1, 2, 4}, {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
   m.Invoke();
   EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
@@ -176,7 +197,8 @@ TEST(SpaceToBatchNDOpTest, ComplexPaddingConstTest) {
 }
 
 TEST(SpaceToBatchNDOpTest, ComplexPaddingDynamicTest) {
-  SpaceToBatchNDOpDynamicModel m({1, 4, 2, 1});
+  SpaceToBatchNDOpDynamicModel m({TensorType_FLOAT32, {1, 4, 2, 1}},
+                                 {TensorType_FLOAT32});
   m.SetInput({1, 2, 3, 4, 5, 6, 7, 8});
   m.SetBlockShape({3, 2});
   m.SetPaddings({1, 1, 2, 4});
@@ -189,6 +211,88 @@ TEST(SpaceToBatchNDOpTest, ComplexPaddingDynamicTest) {
                              }));
 }
 
+class QuantizedSpaceToBatchNDOpTest : public ::testing::Test {
+ protected:
+  std::vector<Matcher<float>> DequantizedArrayNear(
+      const std::vector<float>& values, const float min, const float max) {
+    const float quantization_tolerance = (max - min) / 255.0;
+    return ArrayFloatNear(values, quantization_tolerance);
+  }
+};
+
+TEST_F(QuantizedSpaceToBatchNDOpTest, ZeroNotInQuantizationRange) {
+  // The test_util and actual quantization code currently ensure that the range
+  // must include zero, but if that ever changes, this test will catch it.
+  EXPECT_DEATH(SpaceToBatchNDOpConstModel m(
+                   {TensorType_UINT8, {1, 2, 2, 1}, 1.0, 2.0}, {4, 2},
+                   {0, 0, 1, 1, 1, 1, 0, 0}, {TensorType_UINT8, {}, 1.0, 2.0}),
+               ".*Check failed: f_min <= 0.*");
+}
+
+TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingConstTest) {
+  SpaceToBatchNDOpConstModel m({TensorType_UINT8, {1, 5, 2, 1}, -1.0, 1.0},
+                               {3, 2}, {1, 0, 2, 0},
+                               {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0,   0, -0.5, 0, 0,    0, 0.6,  0, -0.1, 0, -0.7,
+                   0, 0.2, 0, 0.8,  0, -0.3, 0, -0.9, 0, 0.4,  0, 0.1},
+                  -1.0, 1.0)));
+}
+
+TEST_F(QuantizedSpaceToBatchNDOpTest, SimplePaddingDynamicTest) {
+  SpaceToBatchNDOpDynamicModel m({TensorType_UINT8, {1, 5, 2, 1}, -1.0, 1.0},
+                                 {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 0.1});
+  m.SetBlockShape({3, 2});
+  m.SetPaddings({1, 0, 2, 0});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 2, 1}));
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {0, 0,   0, -0.5, 0, 0,    0, 0.6,  0, -0.1, 0, -0.7,
+                   0, 0.2, 0, 0.8,  0, -0.3, 0, -0.9, 0, 0.4,  0, 0.1},
+                  -1.0, 1.0)));
+}
+
+TEST_F(QuantizedSpaceToBatchNDOpTest, ComplexPaddingConstTest) {
+  SpaceToBatchNDOpConstModel m({TensorType_UINT8, {1, 4, 2, 1}, -1.0, 1.0},
+                               {3, 2}, {1, 1, 2, 4},
+                               {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {
+                      0, 0,    0, 0, 0, -0.5, 0, 0, 0, 0,   0, 0, 0, 0.6, 0, 0,
+                      0, -0.1, 0, 0, 0, -0.7, 0, 0, 0, 0.2, 0, 0, 0, 0.8, 0, 0,
+                      0, -0.3, 0, 0, 0, 0,    0, 0, 0, 0.4, 0, 0, 0, 0,   0, 0,
+                  },
+                  -1.0, 1.0)));
+}
+
+TEST_F(QuantizedSpaceToBatchNDOpTest, ComplexPaddingDynamicTest) {
+  SpaceToBatchNDOpDynamicModel m({TensorType_UINT8, {1, 4, 2, 1}, -1.0, 1.0},
+                                 {TensorType_UINT8, {}, -1.0, 1.0});
+  m.SetQuantizedInput({-0.1, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8});
+  m.SetBlockShape({3, 2});
+  m.SetPaddings({1, 1, 2, 4});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({6, 2, 4, 1}));
+  EXPECT_THAT(m.GetDequantizedOutput(),
+              ElementsAreArray(DequantizedArrayNear(
+                  {
+                      0, 0,    0, 0, 0, -0.5, 0, 0, 0, 0,   0, 0, 0, 0.6, 0, 0,
+                      0, -0.1, 0, 0, 0, -0.7, 0, 0, 0, 0.2, 0, 0, 0, 0.8, 0, 0,
+                      0, -0.3, 0, 0, 0, 0,    0, 0, 0, 0.4, 0, 0, 0, 0,   0, 0,
+                  },
+                  -1.0, 1.0)));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/sparse_to_dense.cc b/tensorflow/contrib/lite/kernels/sparse_to_dense.cc
index 404c32ad9ca8b9f1e467b747708ccb451f2a5118..7be5e66c166cd752fc325f25d38e6522948e0f06 100644
--- a/tensorflow/contrib/lite/kernels/sparse_to_dense.cc
+++ b/tensorflow/contrib/lite/kernels/sparse_to_dense.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <unistd.h>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/sub.cc b/tensorflow/contrib/lite/kernels/sub.cc
index 541c85f756b052013d5eb6706ca984465c737271..77a1f596898bb7fa99a7509a25229c627d762bdd 100644
--- a/tensorflow/contrib/lite/kernels/sub.cc
+++ b/tensorflow/contrib/lite/kernels/sub.cc
@@ -81,40 +81,43 @@ template <KernelType kernel_type>
 void EvalSub(TfLiteContext* context, TfLiteNode* node, TfLiteSubParams* params,
              const OpData* data, const TfLiteTensor* input1,
              const TfLiteTensor* input2, TfLiteTensor* output) {
-#define TF_LITE_SUB(type, opname, data_type)                            \
-  data_type output_activation_min, output_activation_max;               \
-  CalculateActivationRange(params->activation, &output_activation_min,  \
-                           &output_activation_max);                     \
-  type::opname(GetTensorData<data_type>(input1), GetTensorDims(input1), \
-               GetTensorData<data_type>(input2), GetTensorDims(input2), \
-               output_activation_min, output_activation_max,            \
-               GetTensorData<data_type>(output), GetTensorDims(output))
+#define TF_LITE_SUB(type, opname, data_type)                             \
+  data_type output_activation_min, output_activation_max;                \
+  CalculateActivationRange(params->activation, &output_activation_min,   \
+                           &output_activation_max);                      \
+  tflite::ArithmeticParams op_params;                                    \
+  SetActivationParams(output_activation_min, output_activation_max,      \
+                      &op_params);                                       \
+  type::opname(op_params, GetTensorShape(input1),                        \
+               GetTensorData<data_type>(input1), GetTensorShape(input2), \
+               GetTensorData<data_type>(input2), GetTensorShape(output), \
+               GetTensorData<data_type>(output))
   if (output->type == kTfLiteInt32) {
     if (kernel_type == kReference) {
       if (data->requires_broadcast) {
-        TF_LITE_SUB(reference_ops, BroadcastSub, int32_t);
+        TF_LITE_SUB(reference_ops, BroadcastSub4DSlow, int32_t);
       } else {
-        TF_LITE_SUB(reference_ops, Sub, int32_t);
+        TF_LITE_SUB(reference_ops, SubWithActivation, int32_t);
       }
     } else {
       if (data->requires_broadcast) {
-        TF_LITE_SUB(optimized_ops, BroadcastSub, int32_t);
+        TF_LITE_SUB(optimized_ops, BroadcastSub4DSlow, int32_t);
       } else {
-        TF_LITE_SUB(optimized_ops, Sub, int32_t);
+        TF_LITE_SUB(optimized_ops, SubWithActivation, int32_t);
       }
     }
   } else if (output->type == kTfLiteFloat32) {
     if (kernel_type == kReference) {
       if (data->requires_broadcast) {
-        TF_LITE_SUB(reference_ops, BroadcastSub, float);
+        TF_LITE_SUB(reference_ops, BroadcastSub4DSlow, float);
       } else {
-        TF_LITE_SUB(reference_ops, Sub, float);
+        TF_LITE_SUB(reference_ops, SubWithActivation, float);
       }
     } else {
       if (data->requires_broadcast) {
-        TF_LITE_SUB(optimized_ops, BroadcastSub, float);
+        TF_LITE_SUB(optimized_ops, BroadcastSub4DSlow, float);
       } else {
-        TF_LITE_SUB(optimized_ops, Sub, float);
+        TF_LITE_SUB(optimized_ops, SubWithActivation, float);
       }
     }
   }
@@ -143,36 +146,43 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
   int input1_shift;
   QuantizeMultiplierSmallerThanOneExp(real_input1_multiplier,
                                       &input1_multiplier, &input1_shift);
-  input1_shift *= -1;
   int32 input2_multiplier;
   int input2_shift;
   QuantizeMultiplierSmallerThanOneExp(real_input2_multiplier,
                                       &input2_multiplier, &input2_shift);
-  input2_shift *= -1;
   int32 output_multiplier;
   int output_shift;
   QuantizeMultiplierSmallerThanOneExp(real_output_multiplier,
                                       &output_multiplier, &output_shift);
-  output_shift *= -1;
 
   int32 output_activation_min, output_activation_max;
   CalculateActivationRangeUint8(params->activation, output,
                                 &output_activation_min, &output_activation_max);
 
-#define TF_LITE_SUB(type, opname)                                            \
-  type::opname(left_shift, GetTensorData<uint8_t>(input1),                   \
-               GetTensorDims(input1), input1_offset, input1_multiplier,      \
-               input1_shift, GetTensorData<uint8_t>(input2),                 \
-               GetTensorDims(input2), input2_offset, input2_multiplier,      \
-               input2_shift, output_offset, output_multiplier, output_shift, \
-               output_activation_min, output_activation_max,                 \
-               GetTensorData<uint8_t>(output), GetTensorDims(output));
+#define TF_LITE_SUB(type, opname)                                      \
+  tflite::ArithmeticParams op_params;                                  \
+  op_params.left_shift = left_shift;                                   \
+  op_params.input1_offset = input1_offset;                             \
+  op_params.input1_multiplier = input1_multiplier;                     \
+  op_params.input1_shift = input1_shift;                               \
+  op_params.input2_offset = input2_offset;                             \
+  op_params.input2_multiplier = input2_multiplier;                     \
+  op_params.input2_shift = input2_shift;                               \
+  op_params.output_offset = output_offset;                             \
+  op_params.output_multiplier = output_multiplier;                     \
+  op_params.output_shift = output_shift;                               \
+  SetActivationParams(output_activation_min, output_activation_max,    \
+                      &op_params);                                     \
+  type::opname(op_params, GetTensorShape(input1),                      \
+               GetTensorData<uint8_t>(input1), GetTensorShape(input2), \
+               GetTensorData<uint8_t>(input2), GetTensorShape(output), \
+               GetTensorData<uint8_t>(output))
   // The quantized version of Sub doesn't support activations, so we
   // always use BroadcastSub.
   if (kernel_type == kReference) {
-    TF_LITE_SUB(reference_ops, BroadcastSub);
+    TF_LITE_SUB(reference_ops, BroadcastSub4DSlow);
   } else {
-    TF_LITE_SUB(optimized_ops, BroadcastSub);
+    TF_LITE_SUB(optimized_ops, BroadcastSub4DSlow);
   }
 #undef TF_LITE_SUB
 }
diff --git a/tensorflow/contrib/lite/kernels/svdf.cc b/tensorflow/contrib/lite/kernels/svdf.cc
index 22eebdd4ceb16aeabc5e799c708f7236b3e2be37..6d4912ce3aa40bf95dc1e26572b8a07fb6362744 100644
--- a/tensorflow/contrib/lite/kernels/svdf.cc
+++ b/tensorflow/contrib/lite/kernels/svdf.cc
@@ -16,7 +16,6 @@ limitations under the License.
 // SVDF op that compresses a fully connected op via low-rank matrix
 // factorization. See https://research.google.com/pubs/archive/43813.pdf for
 // details.
-#include <unistd.h>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
@@ -105,7 +104,7 @@ constexpr int kStateTensor = 0;
 constexpr int kOutputTensor = 1;
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* op_data = new OpData;
+  auto* op_data = new OpData();
   op_data->float_weights_time_initialized = false;
   context->AddTensors(context, /*tensors_to_add=*/4,
                       &op_data->scratch_tensor_index);
diff --git a/tensorflow/contrib/lite/kernels/transpose_conv.cc b/tensorflow/contrib/lite/kernels/transpose_conv.cc
index 8b9deeed20d761876d526c07eb78b602ca7314dc..a9baa5c6988877ccc2e007e5fefdc980d7a3a679 100644
--- a/tensorflow/contrib/lite/kernels/transpose_conv.cc
+++ b/tensorflow/contrib/lite/kernels/transpose_conv.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <unistd.h>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
index c48b470f929da277b09b5c58363ad9081e8966a7..0acd705950cb262bbb2625aa6143f88b429a6562 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_lstm.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <unistd.h>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
index 164a0cbd08d6ce82a413f12ba6b1703087a30aba..0d6d29a171735a00a8dcc6cd0213a859b9f8094a 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <unistd.h>
 #include <cassert>
 #include <cmath>
 #include <cstdio>
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 5e6106a87ed801968106ad2f53a89fb343282e0d..5814cddc5ba8d4099a449ea6e42fc031f6ef46b9 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/types.h>
-#include <unistd.h>
 
 #include "tensorflow/contrib/lite/allocation.h"
 #include "tensorflow/contrib/lite/builtin_op_data.h"
@@ -706,6 +705,15 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = static_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_PACK: {
+      TfLitePackParams* params = MallocPOD<TfLitePackParams>();
+      if (auto* pack_params = op->builtin_options_as_PackOptions()) {
+        params->values_count = pack_params->values_count();
+        params->axis = pack_params->axis();
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
     case BuiltinOperator_DELEGATE: {
       // TODO(ycling): Revisit when supporting saving delegated models.
       error_reporter->Report("DELEGATE op shouldn't exist in model.");
@@ -722,6 +730,14 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = static_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_ONE_HOT: {
+      auto* params = MallocPOD<TfLiteOneHotParams>();
+      if (auto* schema_params = op->builtin_options_as_OneHotOptions()) {
+        params->axis = schema_params->axis();
+      }
+      *builtin_data = static_cast<void*>(params);
+      break;
+    }
 
     // Below are the ops with no builtin_data strcture.
     case BuiltinOperator_BATCH_TO_SPACE_ND:
@@ -764,7 +780,7 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_TOPK_V2:
     case BuiltinOperator_TRANSPOSE:
     case BuiltinOperator_POW:
-    case BuiltinOperator_PACK:
+    case BuiltinOperator_LOGICAL_OR:
       break;
   }
   return kTfLiteOk;
diff --git a/tensorflow/contrib/lite/model_test.cc b/tensorflow/contrib/lite/model_test.cc
index 15bae21a411c1241cf71ab4d3f0e0289eaac8ef3..df4f60d4ad4eb71f48eb3ad364f95f93b84f3d75 100644
--- a/tensorflow/contrib/lite/model_test.cc
+++ b/tensorflow/contrib/lite/model_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/types.h>
-#include <unistd.h>
 
 #include "tensorflow/contrib/lite/model.h"
 
@@ -242,14 +241,6 @@ TEST(BasicFlatBufferModel, TestWithNullVerifier) {
       "tensorflow/contrib/lite/testdata/test_model.bin", nullptr));
 }
 
-struct TestErrorReporter : public ErrorReporter {
-  int Report(const char* format, va_list args) override {
-    calls++;
-    return 0;
-  }
-  int calls = 0;
-};
-
 // This makes sure the ErrorReporter is marshalled from FlatBufferModel to
 // the Interpreter.
 TEST(BasicFlatBufferModel, TestCustomErrorReporter) {
@@ -263,7 +254,7 @@ TEST(BasicFlatBufferModel, TestCustomErrorReporter) {
   TrivialResolver resolver;
   InterpreterBuilder(*model, resolver)(&interpreter);
   ASSERT_NE(interpreter->Invoke(), kTfLiteOk);
-  ASSERT_EQ(reporter.calls, 1);
+  ASSERT_EQ(reporter.num_calls(), 1);
 }
 
 // This makes sure the ErrorReporter is marshalled from FlatBufferModel to
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index 710ce1632e5cd0ebb1302678e436842152313899..1c06b29deb541fa73dd597c7f8e465c760f1720b 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -560,6 +560,14 @@ TfLiteStatus AddOpsAndParams(
         nnapi_version = 11;  // require NNAPI 1.1
         nn_op_type = ANEURALNETWORKS_TRANSPOSE;
         break;
+      case tflite::BuiltinOperator_L2_NORMALIZATION:
+        nn_op_type = ANEURALNETWORKS_L2_NORMALIZATION;
+        if (reinterpret_cast<TfLiteL2NormParams*>(node.builtin_data)
+                ->activation != kTfLiteActNone) {
+          FATAL(
+              "NNAPI does not support L2Normalization with fused activations");
+        }
+        break;
       case tflite::BuiltinOperator_CONCAT_EMBEDDINGS:
       case tflite::BuiltinOperator_LSH_PROJECTION:
       case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
@@ -568,7 +576,6 @@ TfLiteStatus AddOpsAndParams(
       case tflite::BuiltinOperator_EMBEDDING_LOOKUP_SPARSE:
       case tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM:
       case tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM:
-      case tflite::BuiltinOperator_L2_NORMALIZATION:
       case tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION:
       case tflite::BuiltinOperator_PADV2:
       case tflite::BuiltinOperator_RESIZE_BILINEAR:
@@ -615,6 +622,8 @@ TfLiteStatus AddOpsAndParams(
       case tflite::BuiltinOperator_POW:
       case tflite::BuiltinOperator_FAKE_QUANT:
       case tflite::BuiltinOperator_PACK:
+      case tflite::BuiltinOperator_LOGICAL_OR:
+      case tflite::BuiltinOperator_ONE_HOT:
         logError("Op code %d is currently not delegated to NNAPI", builtin);
         return kTfLiteError;
         break;
diff --git a/tensorflow/contrib/lite/profiling/BUILD b/tensorflow/contrib/lite/profiling/BUILD
index b29ca330dc36ad60add57357e01478711f41a17c..1172722f7a70771af73eb07571349e431755471c 100644
--- a/tensorflow/contrib/lite/profiling/BUILD
+++ b/tensorflow/contrib/lite/profiling/BUILD
@@ -20,7 +20,6 @@ cc_test(
     srcs = ["profiler_test.cc"],
     copts = ["-DTFLITE_PROFILING_ENABLED"],
     defines = ["TFLITE_PROFILING_ENABLED"],
-    tags = ["no_oss"],
     deps = [
         ":profiler",
         "//tensorflow/contrib/lite/testing:util",
@@ -77,7 +76,6 @@ cc_test(
     srcs = ["profile_buffer_test.cc"],
     copts = ["-DTFLITE_PROFILING_ENABLED"],
     defines = ["TFLITE_PROFILING_ENABLED"],
-    tags = ["no_oss"],
     deps = [
         ":profile_buffer",
         "//tensorflow/contrib/lite/testing:util",
diff --git a/tensorflow/contrib/lite/profiling/profile_summarizer.cc b/tensorflow/contrib/lite/profiling/profile_summarizer.cc
index 36e87b666a596e9922a4eba14321589b8bc24724..720bd717b9e3b0c45cbdbaaad2b6900edacc3051 100644
--- a/tensorflow/contrib/lite/profiling/profile_summarizer.cc
+++ b/tensorflow/contrib/lite/profiling/profile_summarizer.cc
@@ -23,8 +23,6 @@ namespace tflite {
 namespace profiling {
 namespace {
 
-using Detail = tensorflow::StatsCalculator::Detail;
-
 struct OperatorDetails {
   std::string name;
   std::vector<std::string> inputs;
@@ -125,28 +123,17 @@ void ProfileSummarizer::ProcessProfiles(
   int64_t base_start_us = events[0]->begin_timestamp_us;
   int node_num = 0;
   int64_t curr_total_us = 0;
-  std::map<std::string, Detail> details;
   for (auto event : events) {
     auto op_details = GetOperatorDetails(interpreter, event->event_metadata);
     auto node_name = ToString(op_details.outputs);
-    auto result = details.emplace(node_name, Detail());
-    Detail* detail = &(result.first->second);
-    detail->start_us.UpdateStat(event->begin_timestamp_us - base_start_us);
+    int64_t start_us = event->begin_timestamp_us - base_start_us;
     int64_t node_exec_time =
         event->end_timestamp_us - event->begin_timestamp_us;
-    detail->rel_end_us.UpdateStat(node_exec_time);
+    stats_calculator_->AddNodeStats(node_name, op_details.name, node_num,
+                                    start_us, node_exec_time, 0 /*memory */);
     curr_total_us += node_exec_time;
     ++node_num;
-
-    if (result.second) {
-      detail->name = node_name;
-      detail->type = op_details.name;
-      detail->run_order = node_num;
-      detail->times_called = 0;
-    }
-    ++detail->times_called;
   }
-  stats_calculator_->UpdateDetails(details);
   stats_calculator_->UpdateRunTotalUs(curr_total_us);
 }
 }  // namespace profiling
diff --git a/tensorflow/contrib/lite/profiling/time.cc b/tensorflow/contrib/lite/profiling/time.cc
index 446660bb747cd6e3b694669b64ac1d95cf415fbe..875ddb02bcfc30f4c2ef543fe1c15bec467e5410 100644
--- a/tensorflow/contrib/lite/profiling/time.cc
+++ b/tensorflow/contrib/lite/profiling/time.cc
@@ -14,16 +14,34 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/contrib/lite/profiling/time.h"
 
+#if defined(_MSC_VER)
+#include <chrono>  // NOLINT(build/c++11)
+#else
 #include <sys/time.h>
+#endif
 
 namespace tflite {
 namespace profiling {
 namespace time {
+
+#if defined(_MSC_VER)
+
+uint64_t NowMicros() {
+  return std::chrono::duration_cast<std::chrono::microseconds>(
+             std::chrono::system_clock::now().time_since_epoch())
+      .count();
+}
+
+#else
+
 uint64_t NowMicros() {
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
 }
+
+#endif  // defined(_MSC_VER)
+
 }  // namespace time
 }  // namespace profiling
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index 727fbff38ef7b4b8ea178447a2f9492bd0be5d29..860aff9e7e2de9616dea40f42a33bc1e4ee9f400 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -20,6 +20,7 @@ py_library(
     deps = [
         "//tensorflow/contrib/lite/python/interpreter_wrapper:tensorflow_wrap_interpreter_wrapper",
         "//tensorflow/python:util",
+        "//third_party/py/numpy",
     ],
 )
 
diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
index 0ea2630f711727787332f207bdff6383aac8097c..ec49738fb5365a16c41cc6737198b5707508a3e2 100644
--- a/tensorflow/contrib/lite/python/convert.py
+++ b/tensorflow/contrib/lite/python/convert.py
@@ -115,6 +115,7 @@ def build_toco_convert_protos(input_tensors,
                               inference_type=lite_constants.FLOAT,
                               inference_input_type=None,
                               input_format=lite_constants.TENSORFLOW_GRAPHDEF,
+                              input_shapes=None,
                               output_format=lite_constants.TFLITE,
                               quantized_input_stats=None,
                               default_ranges_stats=None,
@@ -141,6 +142,8 @@ def build_toco_convert_protos(input_tensors,
       Must be `{FLOAT, QUANTIZED_UINT8}`. (default `inference_type`)
     input_format: Type of data to read Currently must be
       `{TENSORFLOW_GRAPHDEF}`. (default TENSORFLOW_GRAPHDEF)
+    input_shapes: Input array shape. It needs to be a list of the same length
+      as `input_tensors`, or None. (default None)
     output_format: Output file format. Currently must be `{TFLITE,
       GRAPHVIZ_DOT}`. (default TFLITE)
     quantized_input_stats: List of tuples of integers representing the mean and
@@ -209,7 +212,11 @@ def build_toco_convert_protos(input_tensors,
     if inference_type == lite_constants.QUANTIZED_UINT8:
       input_array.mean_value, input_array.std_value = quantized_input_stats[idx]
     input_array.name = tensor_name(input_tensor)
-    input_array.shape.dims.extend(map(int, input_tensor.get_shape()))
+    if input_shapes is None:
+      shape = input_tensor.get_shape()
+    else:
+      shape = input_shapes[idx]
+    input_array.shape.dims.extend(map(int, shape))
 
   for output_tensor in output_tensors:
     model.output_arrays.append(tensor_name(output_tensor))
diff --git a/tensorflow/contrib/lite/python/interpreter.py b/tensorflow/contrib/lite/python/interpreter.py
index e1981ceae2a14fe42f4725c2fcd9f7f460770e21..3243bddac879b8eb0ca7a03d28b2f6094f905983 100644
--- a/tensorflow/contrib/lite/python/interpreter.py
+++ b/tensorflow/contrib/lite/python/interpreter.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import sys
+import numpy as np
 from tensorflow.python.util.lazy_loader import LazyLoader
 
 # Lazy load since some of the performance benchmark skylark rules
@@ -162,6 +163,9 @@ class Interpreter(object):
       ValueError: If the interpreter could not resize the input tensor.
     """
     self._ensure_safe()
+    # `ResizeInputTensor` now only accepts int32 numpy array as `tensor_size
+    # parameter.
+    tensor_size = np.array(tensor_size, dtype=np.int32)
     self._interpreter.ResizeInputTensor(input_index, tensor_size)
 
   def get_output_details(self):
@@ -204,7 +208,7 @@ class Interpreter(object):
     for i in range(10):
       input().fill(3.)
       interpreter.invoke()
-      print("inference %s" % output)
+      print("inference %s" % output())
 
     Notice how this function avoids making a numpy array directly. This is
     because it is important to not hold actual numpy views to the data longer
diff --git a/tensorflow/contrib/lite/python/interpreter_test.py b/tensorflow/contrib/lite/python/interpreter_test.py
index 95fa4b8584567771a7c603e035d2335d590d3f78..e77d52ca9950ec42300264bb56ebce253d4982b1 100644
--- a/tensorflow/contrib/lite/python/interpreter_test.py
+++ b/tensorflow/contrib/lite/python/interpreter_test.py
@@ -83,7 +83,7 @@ class InterpreterTest(test_util.TensorFlowTestCase):
     test_input = np.array([[1, 2, 3, 4]], dtype=np.uint8)
     expected_output = np.array([[4, 3, 2, 1]], dtype=np.uint8)
     interpreter.resize_tensor_input(input_details[0]['index'],
-                                    np.array(test_input.shape, dtype=np.int32))
+                                    test_input.shape)
     interpreter.allocate_tensors()
     interpreter.set_tensor(input_details[0]['index'], test_input)
     interpreter.invoke()
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index f97919363bdc4900ae10c7a2b1a6e74e3d5728cc..9ab05f3068494a573ffa5b46f84be66a12d54e46 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -108,7 +108,9 @@ std::unique_ptr<tflite::Interpreter> CreateInterpreter(
   ImportNumpy();
 
   std::unique_ptr<tflite::Interpreter> interpreter;
-  tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+  if (tflite::InterpreterBuilder(*model, resolver)(&interpreter) != kTfLiteOk) {
+    return nullptr;
+  }
   return interpreter;
 }
 
@@ -182,13 +184,37 @@ PyObject* PyTupleFromQuantizationParam(const TfLiteQuantizationParams& param) {
 
 }  // namespace
 
+InterpreterWrapper* InterpreterWrapper::CreateInterpreterWrapper(
+    std::unique_ptr<tflite::FlatBufferModel> model,
+    std::unique_ptr<PythonErrorReporter> error_reporter,
+    std::string* error_msg) {
+  if (!model) {
+    *error_msg = error_reporter->message();
+    return nullptr;
+  }
+
+  auto resolver = absl::make_unique<tflite::ops::builtin::BuiltinOpResolver>();
+  auto interpreter = CreateInterpreter(model.get(), *resolver);
+  if (!interpreter) {
+    *error_msg = error_reporter->message();
+    return nullptr;
+  }
+
+  InterpreterWrapper* wrapper =
+      new InterpreterWrapper(std::move(model), std::move(error_reporter),
+                             std::move(resolver), std::move(interpreter));
+  return wrapper;
+}
+
 InterpreterWrapper::InterpreterWrapper(
     std::unique_ptr<tflite::FlatBufferModel> model,
-    std::unique_ptr<PythonErrorReporter> error_reporter)
+    std::unique_ptr<PythonErrorReporter> error_reporter,
+    std::unique_ptr<tflite::ops::builtin::BuiltinOpResolver> resolver,
+    std::unique_ptr<tflite::Interpreter> interpreter)
     : model_(std::move(model)),
       error_reporter_(std::move(error_reporter)),
-      resolver_(absl::make_unique<tflite::ops::builtin::BuiltinOpResolver>()),
-      interpreter_(CreateInterpreter(model_.get(), *resolver_)) {}
+      resolver_(std::move(resolver)),
+      interpreter_(std::move(interpreter)) {}
 
 InterpreterWrapper::~InterpreterWrapper() {}
 
@@ -421,11 +447,8 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromFile(
   std::unique_ptr<PythonErrorReporter> error_reporter(new PythonErrorReporter);
   std::unique_ptr<tflite::FlatBufferModel> model =
       tflite::FlatBufferModel::BuildFromFile(model_path, error_reporter.get());
-  if (!model) {
-    *error_msg = error_reporter->message();
-    return nullptr;
-  }
-  return new InterpreterWrapper(std::move(model), std::move(error_reporter));
+  return CreateInterpreterWrapper(std::move(model), std::move(error_reporter),
+                                  error_msg);
 }
 
 InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer(
@@ -439,11 +462,8 @@ InterpreterWrapper* InterpreterWrapper::CreateWrapperCPPFromBuffer(
   std::unique_ptr<tflite::FlatBufferModel> model =
       tflite::FlatBufferModel::BuildFromBuffer(buf, length,
                                                error_reporter.get());
-  if (!model) {
-    *error_msg = error_reporter->message();
-    return nullptr;
-  }
-  return new InterpreterWrapper(std::move(model), std::move(error_reporter));
+  return CreateInterpreterWrapper(std::move(model), std::move(error_reporter),
+                                  error_msg);
 }
 
 PyObject* InterpreterWrapper::ResetVariableTensorsToZero() {
diff --git a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
index 556ec7117a179ad9fd880b295c4969bab0f4a7a7..3e03751da40064c64ab646d0b976a2ff5ca9c250 100644
--- a/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
+++ b/tensorflow/contrib/lite/python/interpreter_wrapper/interpreter_wrapper.h
@@ -69,14 +69,28 @@ class InterpreterWrapper {
   PyObject* tensor(PyObject* base_object, int i);
 
  private:
-  InterpreterWrapper(std::unique_ptr<tflite::FlatBufferModel> model,
-                     std::unique_ptr<PythonErrorReporter> error_reporter);
+  // Helper function to construct an `InterpreterWrapper` object.
+  // It only returns InterpreterWrapper if it can construct an `Interpreter`.
+  // Otherwise it returns `nullptr`.
+  static InterpreterWrapper* CreateInterpreterWrapper(
+      std::unique_ptr<tflite::FlatBufferModel> model,
+      std::unique_ptr<PythonErrorReporter> error_reporter,
+      std::string* error_msg);
+
+  InterpreterWrapper(
+      std::unique_ptr<tflite::FlatBufferModel> model,
+      std::unique_ptr<PythonErrorReporter> error_reporter,
+      std::unique_ptr<tflite::ops::builtin::BuiltinOpResolver> resolver,
+      std::unique_ptr<tflite::Interpreter> interpreter);
 
   // InterpreterWrapper is not copyable or assignable. We avoid the use of
   // InterpreterWrapper() = delete here for SWIG compatibility.
   InterpreterWrapper();
   InterpreterWrapper(const InterpreterWrapper& rhs);
 
+  // The public functions which creates `InterpreterWrapper` should ensure all
+  // these member variables are initialized successfully. Otherwise it should
+  // report the error and return `nullptr`.
   const std::unique_ptr<tflite::FlatBufferModel> model_;
   const std::unique_ptr<PythonErrorReporter> error_reporter_;
   const std::unique_ptr<tflite::ops::builtin::BuiltinOpResolver> resolver_;
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 29a1487c1f468055dde85ef6c2657a50f3d2f32b..2f9b9d469a27cc8910cb61c0da14769e5ff0baf0 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -40,24 +40,23 @@ from google.protobuf import text_format as _text_format
 from google.protobuf.message import DecodeError
 from tensorflow.contrib.lite.python import lite_constants as constants
 from tensorflow.contrib.lite.python.convert import build_toco_convert_protos  # pylint: disable=unused-import
-from tensorflow.contrib.lite.python.convert import tensor_name
+from tensorflow.contrib.lite.python.convert import tensor_name as _tensor_name
 from tensorflow.contrib.lite.python.convert import toco_convert
 from tensorflow.contrib.lite.python.convert import toco_convert_protos  # pylint: disable=unused-import
-from tensorflow.contrib.lite.python.convert_saved_model import freeze_saved_model
-from tensorflow.contrib.lite.python.convert_saved_model import get_tensors_from_tensor_names
-from tensorflow.contrib.lite.python.convert_saved_model import set_tensor_shapes
+from tensorflow.contrib.lite.python.convert_saved_model import freeze_saved_model as _freeze_saved_model
+from tensorflow.contrib.lite.python.convert_saved_model import get_tensors_from_tensor_names as _get_tensors_from_tensor_names
+from tensorflow.contrib.lite.python.convert_saved_model import set_tensor_shapes as _set_tensor_shapes
 from tensorflow.contrib.lite.python.interpreter import Interpreter  # pylint: disable=unused-import
 from tensorflow.contrib.lite.python.op_hint import convert_op_hints_to_stubs  # pylint: disable=unused-import
 from tensorflow.contrib.lite.python.op_hint import OpHint  # pylint: disable=unused-import
 from tensorflow.core.framework import graph_pb2 as _graph_pb2
 from tensorflow.python import keras as _keras
 from tensorflow.python.client import session as _session
-from tensorflow.python.framework import graph_util as tf_graph_util
-from tensorflow.python.framework.importer import import_graph_def
-from tensorflow.python.ops.variables import global_variables_initializer
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.saved_model import tag_constants
-# from tensorflow.python.util.all_util import remove_undocumented
+from tensorflow.python.framework import graph_util as _tf_graph_util
+from tensorflow.python.framework.importer import import_graph_def as _import_graph_def
+from tensorflow.python.ops.variables import global_variables_initializer as _global_variables_initializer
+from tensorflow.python.saved_model import signature_constants as _signature_constants
+from tensorflow.python.saved_model import tag_constants as _tag_constants
 
 
 class TocoConverter(object):
@@ -196,7 +195,7 @@ class TocoConverter(object):
         input_arrays or output_arrays contains an invalid tensor name.
     """
     with _session.Session() as sess:
-      sess.run(global_variables_initializer())
+      sess.run(_global_variables_initializer())
 
       # Read GraphDef from file.
       graph_def = _graph_pb2.GraphDef()
@@ -218,12 +217,12 @@ class TocoConverter(object):
           raise ValueError(
               "Unable to parse input file '{}'.".format(graph_def_file))
       sess.graph.as_default()
-      import_graph_def(graph_def, name="")
+      _import_graph_def(graph_def, name="")
 
       # Get input and output tensors.
-      input_tensors = get_tensors_from_tensor_names(sess.graph, input_arrays)
-      output_tensors = get_tensors_from_tensor_names(sess.graph, output_arrays)
-      set_tensor_shapes(input_tensors, input_shapes)
+      input_tensors = _get_tensors_from_tensor_names(sess.graph, input_arrays)
+      output_tensors = _get_tensors_from_tensor_names(sess.graph, output_arrays)
+      _set_tensor_shapes(input_tensors, input_shapes)
 
       # Check if graph is frozen.
       if not _is_frozen_graph(sess):
@@ -261,12 +260,12 @@ class TocoConverter(object):
       TocoConverter class.
     """
     if tag_set is None:
-      tag_set = set([tag_constants.SERVING])
+      tag_set = set([_tag_constants.SERVING])
     if signature_key is None:
-      signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+      signature_key = _signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
 
-    result = freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
-                                output_arrays, tag_set, signature_key)
+    result = _freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
+                                 output_arrays, tag_set, signature_key)
     return cls(
         graph_def=result[0], input_tensors=result[1], output_tensors=result[2])
 
@@ -299,15 +298,15 @@ class TocoConverter(object):
 
     # Get input and output tensors.
     if input_arrays:
-      input_tensors = get_tensors_from_tensor_names(sess.graph, input_arrays)
+      input_tensors = _get_tensors_from_tensor_names(sess.graph, input_arrays)
     else:
       input_tensors = keras_model.inputs
 
     if output_arrays:
-      output_tensors = get_tensors_from_tensor_names(sess.graph, output_arrays)
+      output_tensors = _get_tensors_from_tensor_names(sess.graph, output_arrays)
     else:
       output_tensors = keras_model.outputs
-    set_tensor_shapes(input_tensors, input_shapes)
+    _set_tensor_shapes(input_tensors, input_shapes)
 
     graph_def = _freeze_graph(sess, output_tensors)
     return cls(graph_def, input_tensors, output_tensors)
@@ -328,12 +327,12 @@ class TocoConverter(object):
     for tensor in self._input_tensors:
       if not tensor.get_shape():
         raise ValueError("Provide an input shape for input array '{0}'.".format(
-            tensor_name(tensor)))
+            _tensor_name(tensor)))
       shape = tensor.get_shape().as_list()
       if None in shape[1:]:
         raise ValueError(
             "None is only supported in the 1st dimension. Tensor '{0}' has "
-            "invalid shape '{1}'.".format(tensor_name(tensor), shape))
+            "invalid shape '{1}'.".format(_tensor_name(tensor), shape))
       elif shape[0] is None:
         self._set_batch_size(batch_size=1)
 
@@ -343,7 +342,7 @@ class TocoConverter(object):
       quantized_stats = []
       invalid_stats = []
       for tensor in self._input_tensors:
-        name = tensor_name(tensor)
+        name = _tensor_name(tensor)
         if name in self.quantized_input_stats:
           quantized_stats.append(self.quantized_input_stats[name])
         else:
@@ -381,7 +380,7 @@ class TocoConverter(object):
     Returns:
       List of strings.
     """
-    return [tensor_name(tensor) for tensor in self._input_tensors]
+    return [_tensor_name(tensor) for tensor in self._input_tensors]
 
   def _set_batch_size(self, batch_size):
     """Sets the first dimension of the input tensor to `batch_size`.
@@ -428,11 +427,9 @@ def _freeze_graph(sess, output_tensors):
     Frozen GraphDef.
   """
   if not _is_frozen_graph(sess):
-    sess.run(global_variables_initializer())
-    output_arrays = [tensor_name(tensor) for tensor in output_tensors]
-    return tf_graph_util.convert_variables_to_constants(sess, sess.graph_def,
-                                                        output_arrays)
+    sess.run(_global_variables_initializer())
+    output_arrays = [_tensor_name(tensor) for tensor in output_tensors]
+    return _tf_graph_util.convert_variables_to_constants(
+        sess, sess.graph_def, output_arrays)
   else:
     return sess.graph_def
-
-# remove_undocumented(__name__)
diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/contrib/lite/python/tflite_convert.py
index 9bd1f4f76ee693414a8515a5bd2567001b53e2ea..d17482e60113da5bad3a76fa2ab634ae0ffb89fd 100644
--- a/tensorflow/contrib/lite/python/tflite_convert.py
+++ b/tensorflow/contrib/lite/python/tflite_convert.py
@@ -257,7 +257,7 @@ def run_main(_):
   parser.add_argument(
       "--input_arrays",
       type=str,
-      help="Names of the output arrays, comma-separated.")
+      help="Names of the input arrays, comma-separated.")
   parser.add_argument(
       "--input_shapes",
       type=str,
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 0434199a0889c082fdb50afc33080e35c9067a20..8ed98ddaf40d1ca4d524407458d7b65d76c3ef2c 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -165,6 +165,8 @@ enum BuiltinOperator : byte {
   REDUCE_PROD = 81,
   REDUCE_MAX = 82,
   PACK = 83,
+  LOGICAL_OR = 84,
+  ONE_HOT = 85,
 }
 
 // Options for the builtin operators.
@@ -228,6 +230,8 @@ union BuiltinOptions {
   ArgMinOptions,
   FakeQuantOptions,
   PackOptions,
+  LogicalOrOptions,
+  OneHotOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -544,6 +548,13 @@ table PackOptions {
   axis:int;
 }
 
+table LogicalOrOptions {
+}
+
+table OneHotOptions {
+  axis:int;
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index 9b84030938d5b719108ab0c9e14223405e8930f5..4402f89b85de1df958fd32f57fae8ba9a0c6efee 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -208,6 +208,12 @@ struct FakeQuantOptionsT;
 struct PackOptions;
 struct PackOptionsT;
 
+struct LogicalOrOptions;
+struct LogicalOrOptionsT;
+
+struct OneHotOptions;
+struct OneHotOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -357,11 +363,13 @@ enum BuiltinOperator {
   BuiltinOperator_REDUCE_PROD = 81,
   BuiltinOperator_REDUCE_MAX = 82,
   BuiltinOperator_PACK = 83,
+  BuiltinOperator_LOGICAL_OR = 84,
+  BuiltinOperator_ONE_HOT = 85,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_PACK
+  BuiltinOperator_MAX = BuiltinOperator_ONE_HOT
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[83] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[85] {
   static BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -445,7 +453,9 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[83] {
     BuiltinOperator_FAKE_QUANT,
     BuiltinOperator_REDUCE_PROD,
     BuiltinOperator_REDUCE_MAX,
-    BuiltinOperator_PACK
+    BuiltinOperator_PACK,
+    BuiltinOperator_LOGICAL_OR,
+    BuiltinOperator_ONE_HOT
   };
   return values;
 }
@@ -536,6 +546,8 @@ inline const char **EnumNamesBuiltinOperator() {
     "REDUCE_PROD",
     "REDUCE_MAX",
     "PACK",
+    "LOGICAL_OR",
+    "ONE_HOT",
     nullptr
   };
   return names;
@@ -607,11 +619,13 @@ enum BuiltinOptions {
   BuiltinOptions_ArgMinOptions = 57,
   BuiltinOptions_FakeQuantOptions = 58,
   BuiltinOptions_PackOptions = 59,
+  BuiltinOptions_LogicalOrOptions = 60,
+  BuiltinOptions_OneHotOptions = 61,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_PackOptions
+  BuiltinOptions_MAX = BuiltinOptions_OneHotOptions
 };
 
-inline BuiltinOptions (&EnumValuesBuiltinOptions())[60] {
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[62] {
   static BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -672,7 +686,9 @@ inline BuiltinOptions (&EnumValuesBuiltinOptions())[60] {
     BuiltinOptions_PowOptions,
     BuiltinOptions_ArgMinOptions,
     BuiltinOptions_FakeQuantOptions,
-    BuiltinOptions_PackOptions
+    BuiltinOptions_PackOptions,
+    BuiltinOptions_LogicalOrOptions,
+    BuiltinOptions_OneHotOptions
   };
   return values;
 }
@@ -739,6 +755,8 @@ inline const char **EnumNamesBuiltinOptions() {
     "ArgMinOptions",
     "FakeQuantOptions",
     "PackOptions",
+    "LogicalOrOptions",
+    "OneHotOptions",
     nullptr
   };
   return names;
@@ -989,6 +1007,14 @@ template<> struct BuiltinOptionsTraits<PackOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_PackOptions;
 };
 
+template<> struct BuiltinOptionsTraits<LogicalOrOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_LogicalOrOptions;
+};
+
+template<> struct BuiltinOptionsTraits<OneHotOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_OneHotOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1492,6 +1518,22 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_PackOptions ?
       reinterpret_cast<const PackOptionsT *>(value) : nullptr;
   }
+  LogicalOrOptionsT *AsLogicalOrOptions() {
+    return type == BuiltinOptions_LogicalOrOptions ?
+      reinterpret_cast<LogicalOrOptionsT *>(value) : nullptr;
+  }
+  const LogicalOrOptionsT *AsLogicalOrOptions() const {
+    return type == BuiltinOptions_LogicalOrOptions ?
+      reinterpret_cast<const LogicalOrOptionsT *>(value) : nullptr;
+  }
+  OneHotOptionsT *AsOneHotOptions() {
+    return type == BuiltinOptions_OneHotOptions ?
+      reinterpret_cast<OneHotOptionsT *>(value) : nullptr;
+  }
+  const OneHotOptionsT *AsOneHotOptions() const {
+    return type == BuiltinOptions_OneHotOptions ?
+      reinterpret_cast<const OneHotOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -5391,6 +5433,100 @@ inline flatbuffers::Offset<PackOptions> CreatePackOptions(
 
 flatbuffers::Offset<PackOptions> CreatePackOptions(flatbuffers::FlatBufferBuilder &_fbb, const PackOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct LogicalOrOptionsT : public flatbuffers::NativeTable {
+  typedef LogicalOrOptions TableType;
+  LogicalOrOptionsT() {
+  }
+};
+
+struct LogicalOrOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef LogicalOrOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  LogicalOrOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(LogicalOrOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LogicalOrOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const LogicalOrOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct LogicalOrOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit LogicalOrOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  LogicalOrOptionsBuilder &operator=(const LogicalOrOptionsBuilder &);
+  flatbuffers::Offset<LogicalOrOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<LogicalOrOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<LogicalOrOptions> CreateLogicalOrOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  LogicalOrOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<LogicalOrOptions> CreateLogicalOrOptions(flatbuffers::FlatBufferBuilder &_fbb, const LogicalOrOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct OneHotOptionsT : public flatbuffers::NativeTable {
+  typedef OneHotOptions TableType;
+  int32_t axis;
+  OneHotOptionsT()
+      : axis(0) {
+  }
+};
+
+struct OneHotOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef OneHotOptionsT NativeTableType;
+  enum {
+    VT_AXIS = 4
+  };
+  int32_t axis() const {
+    return GetField<int32_t>(VT_AXIS, 0);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_AXIS) &&
+           verifier.EndTable();
+  }
+  OneHotOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(OneHotOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<OneHotOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const OneHotOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct OneHotOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_axis(int32_t axis) {
+    fbb_.AddElement<int32_t>(OneHotOptions::VT_AXIS, axis, 0);
+  }
+  explicit OneHotOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  OneHotOptionsBuilder &operator=(const OneHotOptionsBuilder &);
+  flatbuffers::Offset<OneHotOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<OneHotOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<OneHotOptions> CreateOneHotOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    int32_t axis = 0) {
+  OneHotOptionsBuilder builder_(_fbb);
+  builder_.add_axis(axis);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<OneHotOptions> CreateOneHotOptions(flatbuffers::FlatBufferBuilder &_fbb, const OneHotOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -5701,6 +5837,12 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const PackOptions *builtin_options_as_PackOptions() const {
     return builtin_options_type() == BuiltinOptions_PackOptions ? static_cast<const PackOptions *>(builtin_options()) : nullptr;
   }
+  const LogicalOrOptions *builtin_options_as_LogicalOrOptions() const {
+    return builtin_options_type() == BuiltinOptions_LogicalOrOptions ? static_cast<const LogicalOrOptions *>(builtin_options()) : nullptr;
+  }
+  const OneHotOptions *builtin_options_as_OneHotOptions() const {
+    return builtin_options_type() == BuiltinOptions_OneHotOptions ? static_cast<const OneHotOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -5968,6 +6110,14 @@ template<> inline const PackOptions *Operator::builtin_options_as<PackOptions>()
   return builtin_options_as_PackOptions();
 }
 
+template<> inline const LogicalOrOptions *Operator::builtin_options_as<LogicalOrOptions>() const {
+  return builtin_options_as_LogicalOrOptions();
+}
+
+template<> inline const OneHotOptions *Operator::builtin_options_as<OneHotOptions>() const {
+  return builtin_options_as_OneHotOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -8060,6 +8210,55 @@ inline flatbuffers::Offset<PackOptions> CreatePackOptions(flatbuffers::FlatBuffe
       _axis);
 }
 
+inline LogicalOrOptionsT *LogicalOrOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new LogicalOrOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void LogicalOrOptions::UnPackTo(LogicalOrOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<LogicalOrOptions> LogicalOrOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LogicalOrOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateLogicalOrOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<LogicalOrOptions> CreateLogicalOrOptions(flatbuffers::FlatBufferBuilder &_fbb, const LogicalOrOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LogicalOrOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateLogicalOrOptions(
+      _fbb);
+}
+
+inline OneHotOptionsT *OneHotOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new OneHotOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void OneHotOptions::UnPackTo(OneHotOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = axis(); _o->axis = _e; };
+}
+
+inline flatbuffers::Offset<OneHotOptions> OneHotOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const OneHotOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateOneHotOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<OneHotOptions> CreateOneHotOptions(flatbuffers::FlatBufferBuilder &_fbb, const OneHotOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const OneHotOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _axis = _o->axis;
+  return tflite::CreateOneHotOptions(
+      _fbb,
+      _axis);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -8485,6 +8684,14 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const PackOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_LogicalOrOptions: {
+      auto ptr = reinterpret_cast<const LogicalOrOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_OneHotOptions: {
+      auto ptr = reinterpret_cast<const OneHotOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -8739,6 +8946,14 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const PackOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_LogicalOrOptions: {
+      auto ptr = reinterpret_cast<const LogicalOrOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_OneHotOptions: {
+      auto ptr = reinterpret_cast<const OneHotOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -8981,6 +9196,14 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const PackOptionsT *>(value);
       return CreatePackOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_LogicalOrOptions: {
+      auto ptr = reinterpret_cast<const LogicalOrOptionsT *>(value);
+      return CreateLogicalOrOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_OneHotOptions: {
+      auto ptr = reinterpret_cast<const OneHotOptionsT *>(value);
+      return CreateOneHotOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -9223,6 +9446,14 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new PackOptionsT(*reinterpret_cast<PackOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_LogicalOrOptions: {
+      value = new LogicalOrOptionsT(*reinterpret_cast<LogicalOrOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_OneHotOptions: {
+      value = new OneHotOptionsT(*reinterpret_cast<OneHotOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -9525,6 +9756,16 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_LogicalOrOptions: {
+      auto ptr = reinterpret_cast<LogicalOrOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_OneHotOptions: {
+      auto ptr = reinterpret_cast<OneHotOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/contrib/lite/simple_memory_arena.cc b/tensorflow/contrib/lite/simple_memory_arena.cc
index 4eaf6f1bfe76efc1e6737d03d58be9bc87bb849d..cd0f1f7c17a50f6ce61fa2033e5d13580399f5cf 100644
--- a/tensorflow/contrib/lite/simple_memory_arena.cc
+++ b/tensorflow/contrib/lite/simple_memory_arena.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/simple_memory_arena.h"
 
+#include <algorithm>
 #include <cstring>
 #include <limits>
 #include <vector>
@@ -34,7 +35,7 @@ namespace tflite {
 TfLiteStatus SimpleMemoryArena::Allocate(TfLiteContext* context,
                                          size_t alignment, size_t size,
                                          ArenaAlloc* new_alloc) {
-  TF_LITE_ENSURE(context, alignment < arena_alignment_);
+  TF_LITE_ENSURE(context, alignment <= arena_alignment_);
 
   if (size == 0) {
     new_alloc->offset = 0;
diff --git a/tensorflow/contrib/lite/testdata/add.bin b/tensorflow/contrib/lite/testdata/add.bin
new file mode 100644
index 0000000000000000000000000000000000000000..aef0fe3d82c9d92dc444076d3b46e05af1923f46
Binary files /dev/null and b/tensorflow/contrib/lite/testdata/add.bin differ
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 4c37bcb3c98cf3f1e0c4b5ffa3843e9f461b7157..a788d41ba7b370cd0e84c343202f1dca090180f3 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -112,7 +112,6 @@ cc_library(
 cc_test(
     name = "message_test",
     srcs = ["message_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":message",
         "@com_google_googletest//:gtest_main",
@@ -132,7 +131,6 @@ cc_test(
     name = "split_test",
     size = "small",
     srcs = ["split_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":split",
         "@com_google_googletest//:gtest_main",
@@ -142,13 +140,13 @@ cc_test(
 cc_library(
     name = "join",
     hdrs = ["join.h"],
+    deps = ["//tensorflow/contrib/lite:string"],
 )
 
 cc_test(
     name = "join_test",
     size = "small",
     srcs = ["join_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":join",
         "@com_google_googletest//:gtest_main",
@@ -174,7 +172,6 @@ cc_test(
     srcs = ["tflite_driver_test.cc"],
     data = ["//tensorflow/contrib/lite:testdata/multi_add.bin"],
     tags = [
-        "no_oss",
         "tflite_not_portable_android",
         "tflite_not_portable_ios",
     ],
@@ -196,7 +193,6 @@ cc_library(
 cc_test(
     name = "tokenize_test",
     srcs = ["tokenize_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":tokenize",
         "@com_google_googletest//:gtest_main",
@@ -214,12 +210,15 @@ cc_library(
 cc_library(
     name = "util",
     hdrs = ["util.h"],
+    deps = [
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string",
+    ],
 )
 
 cc_test(
     name = "test_runner_test",
     srcs = ["test_runner_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":test_runner",
         "@com_google_googletest//:gtest_main",
@@ -275,6 +274,7 @@ cc_library(
         ":join",
         ":split",
         ":tf_driver",
+        "//tensorflow/contrib/lite:string",
         "//tensorflow/core:framework",
     ],
 )
@@ -341,7 +341,7 @@ tf_cc_test(
     ],
     tags = [
         "no_cuda_on_cpu_tap",
-        "no_oss",
+        "no_oss",  # needs test data
         "tflite_not_portable",
     ],
     deps = [
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 32d04c0717898aded98390e34cc9d3c2d2e19b18..4234d0b811d9115717e62cac8447dcdd172b8a32 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -104,6 +104,8 @@ KNOWN_BUGS = {
     r"div.*int32": "72051395",
     # No support for SplitV
     r"split.*num_or_size_splits=\[2,2\]": "73377559",
+    # Scalar constants don't work.
+    r"constant.*shape=\[\]": "109811500",
 }
 
 
@@ -242,7 +244,9 @@ def create_tensor_data(dtype, shape, min_value=-100, max_value=100):
     value = (max_value-min_value)*np.random.random_sample(shape)+min_value
   elif dtype in (tf.int32, tf.uint8, tf.int64):
     value = np.random.randint(min_value, max_value+1, shape)
-  return value.astype(dtype)
+
+  return np.dtype(dtype).type(value) if np.isscalar(value) else value.astype(
+      dtype)
 
 
 def create_scalar_data(dtype, min_value=-100, max_value=100):
@@ -479,7 +483,7 @@ def make_zip_of_tests(zip_path,
                           else report_lib.FAILED)
         report["toco_log"] = toco_log
 
-        if FLAGS.save_graphdefs:
+        if True or FLAGS.save_graphdefs:
           archive.writestr(label + ".pbtxt",
                            text_format.MessageToString(graph_def),
                            zipfile.ZIP_DEFLATED)
@@ -734,21 +738,22 @@ def make_constant_tests(zip_path):
 
   test_parameters = [{
       "dtype": [tf.float32, tf.int32],
-      "input_shape": [[1], [2], [1, 1, 1, 1], [2, 2, 2, 2]],
+      "input_shape": [[], [1], [2], [1, 1, 1, 1], [2, 2, 2, 2]],
   }]
 
   def build_graph(parameters):
-    # Since Toco & Tflite can't have a single constant op in the entire graph,
-    # this test adds a zero tensor with a constant op tensor.
-    input1 = tf.placeholder(dtype=parameters["dtype"], name="input1",
-                            shape=parameters["input_shape"])
-    out = tf.ones(parameters["input_shape"], dtype=parameters["dtype"]) + input1
-    return [input1], [out]
+    dummy_input = tf.placeholder(
+        dtype=parameters["dtype"],
+        name="input1",
+        shape=parameters["input_shape"])
+    out = tf.constant(
+        create_tensor_data(parameters["dtype"], parameters["input_shape"]))
+    return [dummy_input], [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input1 = np.zeros(parameters["input_shape"],
-                      dtype=_TF_TYPE_INFO[parameters["dtype"]][0])
-    return [input1], sess.run(outputs, feed_dict={inputs[0]: input1})
+    dummy_input = np.zeros(
+        parameters["input_shape"], dtype=_TF_TYPE_INFO[parameters["dtype"]][0])
+    return [dummy_input], sess.run(outputs, feed_dict={inputs[0]: dummy_input})
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
@@ -772,6 +777,11 @@ def make_binary_op_tests(zip_path, binary_operator):
       "input_shape_1": [[1, 3, 4, 3]],
       "input_shape_2": [[3]],
       "activation": [True]
+  }, {
+      "dtype": [tf.float32],
+      "input_shape_1": [[]],
+      "input_shape_2": [[]],
+      "activation": [False]
   }]
 
   def build_graph(parameters):
@@ -821,7 +831,7 @@ def make_reduce_tests(reduce_op):
         "input_dtype": [tf.float32, tf.int32, tf.int64],
         "input_shape": [[3, 2, 4]],
         "axis": [
-            None, 0, 1, 2, [0, 1], [0, 2], [1, 2], [0, 1, 2], [1, 0], [2, 0],
+            0, 1, 2, [0, 1], [0, 2], [1, 2], [0, 1, 2], [1, 0], [2, 0],
             [2, 1], [2, 1, 0], [2, 0, 1], -1, -2, -3, [1, -1], [0, -1], [-1, 0],
             [-1, -2, -3], [0, 0, 0], [2, 2, 0], [1, 0, -3, -3]
         ],
@@ -831,13 +841,19 @@ def make_reduce_tests(reduce_op):
         "input_dtype": [tf.float32],
         "input_shape": [[1, 8, 8, 3]],
         "axis": [
-            None, 0, 1, 2, 3, [1, 2], [0, 3], [1, 2, 3], [0, 1, 2, 3],
+            0, 1, 2, 3, [1, 2], [0, 3], [1, 2, 3], [0, 1, 2, 3],
             [3, 2, 1, 0], [3, 1, 0, 2], [2, 0], [3, 0], [3, 1], [1, 0], -1, -2,
             -3, -4, [0, -2], [2, 3, -1, 0], [3, 1, 2, -3], [3, -4], [2, 2, 2],
             [2, 2, 3], [-3, -3, -4], [-3, 2, 1]
         ],
         "const_axis": [True, False],
         "keepdims": [True, False],
+    }, {
+        "input_dtype": [tf.float32],
+        "input_shape": [[], [1, 8, 8, 3], [3, 2, 4]],
+        "axis": [None],
+        "const_axis": [True],
+        "keepdims": [True, False],
     }]
 
     def build_graph(parameters):
@@ -855,7 +871,7 @@ def make_reduce_tests(reduce_op):
         if isinstance(parameters["axis"], list):
           shape = [len(parameters["axis"])]
         else:
-          shape = [0]  # shape for None or integers.
+          shape = []  # shape for None or integers.
         axis = tf.placeholder(dtype=tf.int32, name="axis", shape=shape)
         input_tensors = [input_tensor, axis]
 
@@ -866,10 +882,11 @@ def make_reduce_tests(reduce_op):
     def build_inputs(parameters, sess, inputs, outputs):
       values = [
           create_tensor_data(parameters["input_dtype"],
-                             parameters["input_shape"])]
+                             parameters["input_shape"],
+                             min_value=-10,
+                             max_value=10)]
       if not parameters["const_axis"]:
-        if parameters["axis"]:
-          values.append(np.array(parameters["axis"]))
+        values.append(np.array(parameters["axis"]))
       return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
     make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
@@ -902,7 +919,7 @@ def make_exp_tests(zip_path):
 
   test_parameters = [{
       "input_dtype": [tf.float32],
-      "input_shape": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+      "input_shape": [[], [3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
   }]
 
   def build_graph(parameters):
@@ -961,8 +978,8 @@ def make_maximum_tests(zip_path):
 
   test_parameters = [{
       "input_dtype": [tf.float32],
-      "input_shape_1": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
-      "input_shape_2": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+      "input_shape_1": [[], [3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+      "input_shape_2": [[], [3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
   }]
 
   def build_graph(parameters):
@@ -996,8 +1013,8 @@ def make_minimum_tests(zip_path):
 
   test_parameters = [{
       "input_dtype": [tf.float32],
-      "input_shape_1": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
-      "input_shape_2": [[3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+      "input_shape_1": [[], [3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
+      "input_shape_2": [[], [3], [1, 100], [4, 2, 3], [5, 224, 224, 3]],
   }]
 
   def build_graph(parameters):
@@ -1595,19 +1612,34 @@ def make_reshape_tests(zip_path):
       "dtype": [tf.float32, tf.int32],
       "input_shape": [[3, 4, 5, 7], [4, 105], [21, 5, 2, 2], [420]],
       "output_shape": [[15, 28], [420], [1, -1, 5, 7], [-1]],
+      "constant_shape": [True, False],
   }]
 
   def build_graph(parameters):
     input_tensor = tf.placeholder(dtype=parameters["dtype"], name="input",
                                   shape=parameters["input_shape"])
-    out = tf.reshape(input_tensor, shape=parameters["output_shape"])
-    return [input_tensor], [out]
+
+    # Get shape as either a placeholder or constants.
+    if parameters["constant_shape"]:
+      output_shape = parameters["output_shape"]
+      input_tensors = [input_tensor]
+    else:
+      # The shape of the shape tensor.
+      shape_tensor_shape = [len(parameters["output_shape"])]
+      output_shape = tf.placeholder(
+          dtype=tf.int32, name="output_shape", shape=shape_tensor_shape)
+      input_tensors = [input_tensor, output_shape]
+    out = tf.reshape(input_tensor, shape=output_shape)
+    return input_tensors, [out]
 
   def build_inputs(parameters, sess, inputs, outputs):
-    input_values = create_tensor_data(parameters["dtype"],
-                                      parameters["input_shape"])
-    return [input_values], sess.run(
-        outputs, feed_dict=dict(zip(inputs, [input_values])))
+    values = [
+        create_tensor_data(parameters["dtype"], parameters["input_shape"])
+    ]
+    if not parameters["constant_shape"]:
+      values.append(np.array(parameters["output_shape"]))
+
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
@@ -1638,6 +1670,65 @@ def make_shape_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_one_hot_tests(zip_path):
+  """Make a set of tests to do one_hot."""
+
+  test_parameters = [{
+      "indices_type": [tf.int32, tf.int64],
+      "indices_shape": [[3], [4, 4], [1, 5], [5, 1]],
+      "axis": [0, 1],
+      "dtype": [tf.int32, tf.int64, tf.float32],
+      "provide_optional_inputs": [True, False],
+  }]
+
+  def build_graph(parameters):
+    indices = tf.placeholder(
+        dtype=parameters["indices_type"],
+        name="indices",
+        shape=parameters["indices_shape"])
+    depth = tf.placeholder(dtype=tf.int32, name="depth", shape=())
+
+    if not parameters["provide_optional_inputs"]:
+      out = tf.one_hot(indices=indices, depth=depth)
+      return [indices, depth], [out]
+
+    on_value = tf.placeholder(
+        dtype=parameters["dtype"], name="on_value", shape=())
+    off_value = tf.placeholder(
+        dtype=parameters["dtype"], name="off_value", shape=())
+    out = tf.one_hot(
+        indices=indices,
+        depth=depth,
+        on_value=on_value,
+        off_value=off_value,
+        axis=parameters["axis"],
+        dtype=parameters["dtype"])
+    return [indices, depth, on_value, off_value], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = [
+        create_tensor_data(
+            parameters["indices_type"],
+            shape=parameters["indices_shape"],
+            min_value=-1,
+            max_value=10),
+        create_tensor_data(tf.int32, shape=None, min_value=1, max_value=10),
+    ]
+
+    if parameters["provide_optional_inputs"]:
+      input_values.append(
+          create_tensor_data(
+              parameters["dtype"], shape=None, min_value=1, max_value=10))
+      input_values.append(
+          create_tensor_data(
+              parameters["dtype"], shape=None, min_value=-1, max_value=0))
+
+    return input_values, sess.run(
+        outputs, feed_dict=dict(zip(inputs, input_values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_resize_bilinear_tests(zip_path):
   """Make a set of tests to do resize_bilinear."""
 
@@ -2237,7 +2328,7 @@ def make_arg_min_max_tests(zip_path):
 
   test_parameters = [{
       "input_dtype": [tf.float32, tf.int32],
-      "input_shape": [[1, 1, 1, 3], [2, 3, 4, 5], [2, 3, 3], [5, 5], [10]],
+      "input_shape": [[], [1, 1, 1, 3], [2, 3, 4, 5], [2, 3, 3], [5, 5], [10]],
       "output_type": [tf.int32, tf.int64],
       "axis_is_last_dim": [True, False],
       "is_arg_max": [True],
@@ -2273,7 +2364,8 @@ def make_equal_tests(zip_path):
 
   test_parameters = [{
       "input_dtype": [tf.float32, tf.int32, tf.int64],
-      "input_shape_pair": [([1, 1, 1, 3], [1, 1, 1, 3]),
+      "input_shape_pair": [([], []),
+                           ([1, 1, 1, 3], [1, 1, 1, 3]),
                            ([2, 3, 4, 5], [2, 3, 4, 5]), ([2, 3, 3], [2, 3]),
                            ([5, 5], [1]), ([10], [2, 4, 10])],
   }]
@@ -2530,7 +2622,7 @@ def _make_elementwise_tests(op):
     """Actual function that generates examples."""
     test_parameters = [{
         "input_dtype": [tf.float32],
-        "input_shape": [[1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
+        "input_shape": [[], [1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
     }]
 
     def build_graph(parameters):
@@ -2852,6 +2944,44 @@ def make_sparse_to_dense_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_pack_tests(zip_path):
+  """Make a set of tests to do stack."""
+
+  test_parameters = [{
+      "base_shape": [[3, 4, 3], [3, 4], [5]],
+      "num_tensors": [1, 2, 3, 4, 5, 6],
+      "axis": [0, 1, 2, 3],
+      "additional_shape": [1, 2, 3],
+  }]
+
+  def get_shape(parameters):
+    """Return a tweaked version of 'base_shape'."""
+    axis = parameters["axis"]
+    shape = parameters["base_shape"][:]
+    if axis < len(shape):
+      shape[axis] += parameters["additional_shape"]
+    return shape
+
+  def build_graph(parameters):
+    all_tensors = []
+    for n in range(0, parameters["num_tensors"]):
+      input_tensor = tf.placeholder(
+          dtype=tf.float32, name=("input%d" % n), shape=get_shape(parameters))
+      all_tensors.append(input_tensor)
+    out = tf.stack(all_tensors, parameters["axis"])
+    return all_tensors, [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    all_values = []
+    for _ in range(0, parameters["num_tensors"]):
+      input_values = create_tensor_data(np.float32, get_shape(parameters))
+      all_values.append(input_values)
+    return all_values, sess.run(
+        outputs, feed_dict=dict(zip(inputs, all_values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 # Toco binary path provided by the generate rule.
 bin_path = None
 
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.cc b/tensorflow/contrib/lite/testing/generate_testspec.cc
index c1092e4d25567f0374e3cd5a27bde32419d3db19..f29c188e6c2c55bdb13d257c70e23c2943abfa4a 100644
--- a/tensorflow/contrib/lite/testing/generate_testspec.cc
+++ b/tensorflow/contrib/lite/testing/generate_testspec.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <iostream>
+
 #include "tensorflow/contrib/lite/testing/generate_testspec.h"
 #include "tensorflow/contrib/lite/testing/join.h"
 #include "tensorflow/contrib/lite/testing/split.h"
@@ -88,13 +90,13 @@ bool GenerateTestSpecFromTensorflowModel(
   TfDriver runner(input_layer, input_layer_type, input_layer_shape,
                   output_layer);
   if (!runner.IsValid()) {
-    cerr << runner.GetErrorMessage() << endl;
+    std::cerr << runner.GetErrorMessage() << std::endl;
     return false;
   }
 
   runner.LoadModel(tensorflow_model_path);
   if (!runner.IsValid()) {
-    cerr << runner.GetErrorMessage() << endl;
+    std::cerr << runner.GetErrorMessage() << std::endl;
     return false;
   }
 
@@ -118,14 +120,14 @@ bool GenerateTestSpecFromTensorflowModel(
     for (int j = 0; j < input_values.size(); j++) {
       runner.SetInput(j, input_values[j]);
       if (!runner.IsValid()) {
-        cerr << runner.GetErrorMessage() << endl;
+        std::cerr << runner.GetErrorMessage() << std::endl;
         return false;
       }
     }
 
     runner.Invoke();
     if (!runner.IsValid()) {
-      cerr << runner.GetErrorMessage() << endl;
+      std::cerr << runner.GetErrorMessage() << std::endl;
       return false;
     }
 
@@ -137,7 +139,7 @@ bool GenerateTestSpecFromTensorflowModel(
     for (int j = 0; j < output_layer.size(); j++) {
       stream << "  output: \"" << runner.ReadOutput(j) << "\"\n";
       if (!runner.IsValid()) {
-        cerr << runner.GetErrorMessage() << endl;
+        std::cerr << runner.GetErrorMessage() << std::endl;
         return false;
       }
     }
diff --git a/tensorflow/contrib/lite/testing/generate_testspec.h b/tensorflow/contrib/lite/testing/generate_testspec.h
index bfaf5e7ec89bbdd85b68a7dc45d7686e143e5d3d..b3d0db31c01a8cb1b8f34ff6dbb00c77de29b131 100644
--- a/tensorflow/contrib/lite/testing/generate_testspec.h
+++ b/tensorflow/contrib/lite/testing/generate_testspec.h
@@ -19,6 +19,8 @@ limitations under the License.
 #include <iostream>
 #include <vector>
 
+#include "tensorflow/contrib/lite/string.h"
+
 namespace tflite {
 namespace testing {
 
diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index 770092e12c537c170b34536a51a9d0fbf9e1bf6b..106cbc1b8e1d289ec04721611294c6a4c79dabb4 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -226,7 +226,8 @@ TEST_P(OpsTest, RunZipTests) {
   string message = test_driver.GetErrorMessage();
   if (bug_number.empty()) {
     if (FLAGS_use_nnapi && FLAGS_ignore_unsupported_nnapi && !result) {
-      EXPECT_EQ(message, string("Failed to invoke interpreter")) << message;
+      EXPECT_EQ(message, string("Failed to invoke NNAPI interpreter"))
+          << message;
     } else {
       EXPECT_TRUE(result) << message;
     }
diff --git a/tensorflow/contrib/lite/testing/join.h b/tensorflow/contrib/lite/testing/join.h
index 1edee01cf97da3c53be1895e667b005551ac2991..4be19ad7569c3333b6647b91adbc6e77ff088f10 100644
--- a/tensorflow/contrib/lite/testing/join.h
+++ b/tensorflow/contrib/lite/testing/join.h
@@ -17,7 +17,8 @@ limitations under the License.
 
 #include <cstdlib>
 #include <sstream>
-#include <string>
+
+#include "tensorflow/contrib/lite/string.h"
 
 namespace tflite {
 namespace testing {
diff --git a/tensorflow/contrib/lite/testing/test_runner.h b/tensorflow/contrib/lite/testing/test_runner.h
index 96ab6be54e528334f9e4a8cc259e44f99878fefb..fac7d01aab4b1e4c251213041eb4b823cd7d66aa 100644
--- a/tensorflow/contrib/lite/testing/test_runner.h
+++ b/tensorflow/contrib/lite/testing/test_runner.h
@@ -90,7 +90,7 @@ class TestRunner {
 
   // Invalidate the test runner, preventing it from executing any further.
   void Invalidate(const string& error_message) {
-    cerr << error_message << std::endl;
+    std::cerr << error_message << std::endl;
     error_message_ = error_message;
   }
   bool IsValid() const { return error_message_.empty(); }
diff --git a/tensorflow/contrib/lite/testing/tf_driver.cc b/tensorflow/contrib/lite/testing/tf_driver.cc
index 3b27f6f3da92ce80c3830feb7c6af095e7c48e9c..ec435ca60d959a11a9392b6fbab99b0561f50942 100644
--- a/tensorflow/contrib/lite/testing/tf_driver.cc
+++ b/tensorflow/contrib/lite/testing/tf_driver.cc
@@ -28,8 +28,8 @@ namespace {
 
 tensorflow::Tensor CreateTensor(const tensorflow::DataType type,
                                 const std::vector<int64_t>& dim) {
-  tensorflow::TensorShape shape{gtl::ArraySlice<int64>{
-      reinterpret_cast<const int64*>(dim.data()), dim.size()}};
+  tensorflow::TensorShape shape{tensorflow::gtl::ArraySlice<tensorflow::int64>{
+      reinterpret_cast<const tensorflow::int64*>(dim.data()), dim.size()}};
   return {type, shape};
 }
 
@@ -179,7 +179,7 @@ void TfDriver::Invoke() {
   auto status = session_->Run({input_tensors_.begin(), input_tensors_.end()},
                               output_names_, {}, &output_tensors_);
   if (!status.ok()) {
-    Invalidate("Failed to invoke interpreter");
+    Invalidate("Failed to run input data on graph");
   }
 }
 
diff --git a/tensorflow/contrib/lite/testing/tflite_diff_flags.h b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
index 7a57e8d3fba29cd106eb038992bb5ed12bb457ae..695c2a3de6c5d7c74a943134f0c97390710ef1e7 100644
--- a/tensorflow/contrib/lite/testing/tflite_diff_flags.h
+++ b/tensorflow/contrib/lite/testing/tflite_diff_flags.h
@@ -15,6 +15,8 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DIFF_FLAGS_H_
 #define TENSORFLOW_CONTRIB_LITE_TESTING_TFLITE_DIFF_FLAGS_H_
 
+#include <cstring>
+
 #include "tensorflow/contrib/lite/testing/split.h"
 #include "tensorflow/contrib/lite/testing/tflite_diff_util.h"
 #include "tensorflow/core/util/command_line_flags.h"
diff --git a/tensorflow/contrib/lite/testing/util.h b/tensorflow/contrib/lite/testing/util.h
index 6d20aec141c7c3a3e48af290edb169c6fd7254cf..8aa639157b8b68061f9ee8c3483959a79cb5794e 100644
--- a/tensorflow/contrib/lite/testing/util.h
+++ b/tensorflow/contrib/lite/testing/util.h
@@ -15,8 +15,39 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
 #define TENSORFLOW_CONTRIB_LITE_TESTING_UTIL_H_
 
+#include <cstdio>
+
+#include "tensorflow/contrib/lite/error_reporter.h"
+#include "tensorflow/contrib/lite/string.h"
+
 namespace tflite {
 
+// An ErrorReporter that collects error message in a string, in addition
+// to printing to stderr.
+class TestErrorReporter : public ErrorReporter {
+ public:
+  int Report(const char* format, va_list args) override {
+    char buffer[1024];
+    int size = vsnprintf(buffer, sizeof(buffer), format, args);
+    fprintf(stderr, "%s", buffer);
+    error_messages_ += buffer;
+    num_calls_++;
+    return size;
+  }
+
+  void Reset() {
+    num_calls_ = 0;
+    error_messages_.clear();
+  }
+
+  int num_calls() const { return num_calls_; }
+  const string& error_messages() const { return error_messages_; }
+
+ private:
+  int num_calls_ = 0;
+  string error_messages_;
+};
+
 inline void LogToStderr() {
 #ifdef PLATFORM_GOOGLE
   FLAGS_logtostderr = true;
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index f9a6d31d6048f22d7650ada1d5313fc3824aef8a..9983e5991082b320b5a213a51e9e746c0476b551 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -1257,8 +1257,6 @@ void ConvertPackOperator(const Model& model, const PackOperator& src_op,
   for (const auto& input : src_op.inputs) {
     *pack_op->add_input() = input;
   }
-  (*pack_op->mutable_attr())["elem_type"].set_type(
-      GetTensorFlowDataType(model, src_op.outputs[0]));
   (*pack_op->mutable_attr())["axis"].set_i(src_op.axis);
   (*pack_op->mutable_attr())["N"].set_i(src_op.inputs.size());
   (*pack_op->mutable_attr())["T"].set_type(GetTensorFlowDataType(src_op.dtype));
@@ -1318,6 +1316,20 @@ void ConvertResizeBilinearOperator(const Model& model,
   (*resize_op->mutable_attr())["align_corners"].set_b(src_op.align_corners);
 }
 
+void ConvertOneHotOperator(const Model& model, const OneHotOperator& src_op,
+                           GraphDef* tensorflow_graph) {
+  tensorflow::NodeDef* onehot_op = tensorflow_graph->add_node();
+  onehot_op->set_op("OneHot");
+  onehot_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 4);
+  for (const auto& input : src_op.inputs) {
+    *onehot_op->add_input() = input;
+  }
+  (*onehot_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.outputs[0]));
+  (*onehot_op->mutable_attr())["axis"].set_i(src_op.axis);
+}
+
 namespace {
 // TODO(aselle): Remove when available in absl
 absl::string_view FindLongestCommonPrefix(absl::string_view a,
@@ -2160,6 +2172,9 @@ void ConvertOperator(const Model& model, const Operator& src_op,
     ConvertLogicalNotOperator(model,
                               static_cast<const LogicalNotOperator&>(src_op),
                               tensorflow_graph);
+  } else if (src_op.type == OperatorType::kOneHot) {
+    ConvertOneHotOperator(model, static_cast<const OneHotOperator&>(src_op),
+                          tensorflow_graph);
   } else {
     LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(src_op.type);
   }
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
index 18b7848db86e553ec645fa87298420012b5f753f..4bf47aa3c4d1b682808ab8175c4d07d8a347067a 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
@@ -36,7 +36,7 @@ There are two approaches to running TOCO via command line.
 *   `tflite_convert`: Starting from TensorFlow 1.9, the command-line tool
     `tflite_convert` will be installed as part of the Python package. All of the
     examples below use `tflite_convert` for simplicity.
-    *   Example: `tflite --output_file=...`
+    *   Example: `tflite_convert --output_file=...`
 *   `bazel`: In order to run the latest version of TOCO, [clone the TensorFlow
     repository](https://www.tensorflow.org/install/install_sources#clone_the_tensorflow_repository)
     and use `bazel`. This is the recommended approach for converting models that
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc b/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
index 75642bbc37be6b3140e5b79a463ca70b5786d772..c13fc0de7502a9edc80dc399354708a5b1b96b02 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/ensure_uint8_weights_safe_for_fast_int8_kernels.cc
@@ -181,7 +181,7 @@ bool EnsureUint8WeightsSafeForFastInt8Kernels::Run(Model* model,
         // future without worrying.
         static constexpr int kMinDistanceBetweenBadValues = 16;
         if (distance < kMinDistanceBetweenBadValues) {
-          if (allow_nudging_weights()) {
+          if (allow_nudging_weights() || has_default_ranges_flag()) {
             buffer_data[i] = 1;
             changed = true;
             continue;
@@ -200,6 +200,15 @@ bool EnsureUint8WeightsSafeForFastInt8Kernels::Run(Model* model,
   }
 
   if (changed) {
+    if (has_default_ranges_flag()) {
+      std::cerr
+          << "Since the specified values of --default_ranges_min and "
+             "--default_ranges_max result in values incompatible with TFLite's "
+             "fast int8 kernels, "
+             "--allow_nudging_weights_to_use_fast_gemm_kernel "
+             "has been enabled. This may affect the accuracy of the model."
+          << std::endl;
+    }
     AddMessageF("Tweaked weights values for %s", LogName(op));
   }
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index b7634e28c6a1e509d2b68b9f514f86a26c233f5d..8d9a4c4700e12ac1a187038a0a5efc1b033d4e57 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -262,8 +262,12 @@ class EnsureUint8WeightsSafeForFastInt8Kernels : public GraphTransformation {
   bool allow_nudging_weights() const { return allow_nudging_weights_; }
   void set_allow_nudging_weights(bool val) { allow_nudging_weights_ = val; }
 
+  bool has_default_ranges_flag() const { return has_default_ranges_flag_; }
+  void set_has_default_ranges_flag(bool val) { has_default_ranges_flag_ = val; }
+
  private:
   bool allow_nudging_weights_ = false;
+  bool has_default_ranges_flag_ = false;
 };
 
 #undef DECLARE_GRAPH_TRANSFORMATION
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
index 3dda536ef7f244a1a59618a4fb293dcfde33246d..0f94006f34ecbfa418365fa93a940a4cacde31e7 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -154,8 +154,8 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
         return false;
       }
       for (int i = 0; i < op->outputs.size(); ++i) {
-        auto output = op->outputs[i];
-        auto data_type = unsupported_op->output_data_types[i];
+        const string& output = op->outputs[i];
+        const ArrayDataType data_type = unsupported_op->output_data_types[i];
         model->GetArray(output).data_type = data_type;
       }
       break;
@@ -193,6 +193,26 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
       SetDataTypeForAllOutputs(model, op, data_type);
       break;
     }
+    case OperatorType::kPack: {
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      for (const auto& input : op->inputs) {
+        CHECK(data_type == model->GetArray(input).data_type);
+      }
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
+    case OperatorType::kOneHot: {
+      CHECK_EQ(op->inputs.size(), 4);
+      CHECK_EQ(op->outputs.size(), 1);
+      const ArrayDataType on_value_type =
+          model->GetArray(op->inputs[OneHotOperator::ON_VALUE_INPUT]).data_type;
+      const ArrayDataType off_value_type =
+          model->GetArray(op->inputs[OneHotOperator::OFF_VALUE_INPUT])
+              .data_type;
+      CHECK(on_value_type == off_value_type);
+      model->GetArray(op->outputs[0]).data_type = on_value_type;
+      break;
+    }
     default: {
       // These operators produce outputs with the same type as their 1st input
       CHECK_GT(op->inputs.size(), 0);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 62ed5c46e99c561764e853c0419f2046c8086dba..5aa0fddf571dbbf2bd184b100f9efe0bdabf5b43 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1578,6 +1578,61 @@ void ProcessAnyOperator(Model* model, AnyOperator* op) {
   }
 }
 
+void ProcessOneHotOperator(Model* model, OneHotOperator* op) {
+  CHECK_EQ(op->inputs.size(), 4);
+  CHECK_EQ(op->outputs.size(), 1);
+  auto& output_array = model->GetArray(op->outputs[0]);
+  if (output_array.has_shape()) {
+    // Shape already propagated
+    return;
+  }
+
+  // Yield until indices dims have been resolved.
+  const auto& indices_array =
+      model->GetArray(op->inputs[OneHotOperator::INDICES_INPUT]);
+  if (!indices_array.has_shape()) {
+    return;
+  }
+
+  // Yield until depth is constant and dims have been resolved.
+  if (!IsConstantParameterArray(*model,
+                                op->inputs[OneHotOperator::DEPTH_INPUT])) {
+    return;
+  }
+  const auto& depth_array =
+      model->GetArray(op->inputs[OneHotOperator::DEPTH_INPUT]);
+  if (!depth_array.has_shape()) {
+    return;
+  }
+
+  CHECK(depth_array.data_type == ArrayDataType::kInt32)
+      << "Depth array must be int32.";
+  CHECK_EQ(RequiredBufferSizeForShape(depth_array.shape()), 1)
+      << "Depth array must be scalar.";
+
+  const int depth = depth_array.GetBuffer<ArrayDataType::kInt32>().data[0];
+  CHECK_GE(depth, 0) << "Depth must be non-negative.";
+
+  const int indices_dims = indices_array.shape().dimensions_count();
+  const int output_dims = indices_dims + 1;
+  const int axis = op->axis == -1 ? indices_dims : op->axis;
+  CHECK_GE(axis, 0) << "Resolved axis must be non-negative.";
+
+  auto* mutable_dims = output_array.mutable_shape()->mutable_dims();
+  mutable_dims->resize(output_dims);
+  for (int i = 0; i < output_dims; ++i) {
+    int dim = 0;
+    if (i < axis) {
+      dim = indices_array.shape().dims(i);
+    } else if (i == axis) {
+      dim = depth;
+    } else {
+      dim = indices_array.shape().dims(i - 1);
+    }
+    (*mutable_dims)[i] = dim;
+  }
+}
+
 }  // namespace
 
 bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
@@ -1786,8 +1841,19 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
       ProcessArgMinMaxOperator<ArgMinOperator>(
           model, static_cast<ArgMinOperator*>(op));
       break;
-    case OperatorType::kUnsupported:
+    case OperatorType::kUnsupported: {
+      const auto* unsupported_op =
+          static_cast<TensorFlowUnsupportedOperator*>(op);
+      // Attribute can be not specified, ignore it.
+      if (unsupported_op->output_shapes.size() < op->outputs.size()) {
+        return false;
+      }
+      for (int i = 0; i < op->outputs.size(); ++i) {
+        const string& output = op->outputs[i];
+        model->GetArray(output).copy_shape(unsupported_op->output_shapes.at(i));
+      }
       break;
+    }
     case OperatorType::kSvdf:
       ProcessSvdfOperator(model, static_cast<SvdfOperator*>(op));
       break;
@@ -1814,6 +1880,9 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kAny:
       ProcessAnyOperator(model, static_cast<AnyOperator*>(op));
       break;
+    case OperatorType::kOneHot:
+      ProcessOneHotOperator(model, static_cast<OneHotOperator*>(op));
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index 5be275747906fc8b30194f2e3d062db45fbb088f..f6ce3b3ecb2cc06708287804bf34aa152d668f8c 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -50,6 +50,7 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kSqueeze || type == OperatorType::kPad ||
          type == OperatorType::kPadV2 || type == OperatorType::kReshape ||
          type == OperatorType::kTanh || type == OperatorType::kMul ||
+         type == OperatorType::kBatchToSpaceND ||
          type == OperatorType::kSpaceToBatchND ||
          type == OperatorType::kSpaceToDepth ||
          type == OperatorType::kStridedSlice ||
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 8bb797fe0f09a40c0c08df3557d33bd042b965dc..7e593029e5fd0445a345385f69f249bcf9dde491 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -215,7 +215,7 @@ tensorflow::Status ImportFloatArray(const TensorProto& input_tensor,
                                     Array* output_array) {
   CHECK_EQ(input_tensor.dtype(), DT_FLOAT);
   const auto& input_shape = input_tensor.tensor_shape();
-  CHECK_LE(input_shape.dim_size(), 4);
+  CHECK_LE(input_shape.dim_size(), 6);
   int input_flat_size;
   auto status = ImportShape(input_shape.dim(), &input_flat_size,
                             output_array->mutable_shape());
@@ -253,7 +253,7 @@ tensorflow::Status ImportQuint8Array(const TensorProto& input_tensor,
                                      Array* output_array) {
   CHECK_EQ(input_tensor.dtype(), DT_QUINT8);
   const auto& input_shape = input_tensor.tensor_shape();
-  CHECK_LE(input_shape.dim_size(), 4);
+  CHECK_LE(input_shape.dim_size(), 6);
   int input_flat_size;
   auto status = ImportShape(input_shape.dim(), &input_flat_size,
                             output_array->mutable_shape());
@@ -290,7 +290,7 @@ tensorflow::Status ImportInt32Array(const TensorProto& input_tensor,
                                     Array* output_array) {
   CHECK_EQ(input_tensor.dtype(), DT_INT32);
   const auto& input_shape = input_tensor.tensor_shape();
-  CHECK_LE(input_shape.dim_size(), 4);
+  CHECK_LE(input_shape.dim_size(), 6);
   int input_flat_size;
   auto status = ImportShape(input_shape.dim(), &input_flat_size,
                             output_array->mutable_shape());
@@ -326,7 +326,7 @@ tensorflow::Status ImportInt64Array(const TensorProto& input_tensor,
                                     Array* output_array) {
   CHECK_EQ(input_tensor.dtype(), DT_INT64);
   const auto& input_shape = input_tensor.tensor_shape();
-  CHECK_LE(input_shape.dim_size(), 4);
+  CHECK_LE(input_shape.dim_size(), 6);
   int input_flat_size;
   auto status = ImportShape(input_shape.dim(), &input_flat_size,
                             output_array->mutable_shape());
@@ -363,7 +363,7 @@ tensorflow::Status ImportBoolArray(const TensorProto& input_tensor,
                                    Array* output_array) {
   CHECK_EQ(input_tensor.dtype(), DT_BOOL);
   const auto& input_shape = input_tensor.tensor_shape();
-  CHECK_LE(input_shape.dim_size(), 4);
+  CHECK_LE(input_shape.dim_size(), 6);
   int input_flat_size;
   auto status = ImportShape(input_shape.dim(), &input_flat_size,
                             output_array->mutable_shape());
@@ -409,7 +409,7 @@ tensorflow::Status ImportStringArray(const TensorProto& input_tensor,
                                      Array* output_array) {
   CHECK_EQ(input_tensor.dtype(), DT_STRING);
   const auto& input_shape = input_tensor.tensor_shape();
-  CHECK_LE(input_shape.dim_size(), 4);
+  CHECK_LE(input_shape.dim_size(), 6);
   int input_flat_size;
   auto status = ImportShape(input_shape.dim(), &input_flat_size,
                             output_array->mutable_shape());
@@ -1045,6 +1045,11 @@ tensorflow::Status ConvertSimpleOperator(
 tensorflow::Status ConvertUnsupportedOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
+  // Names of special attributes in TF graph that are used by Toco.
+  static constexpr char kAttrOutputQuantized[] = "_output_quantized";
+  static constexpr char kAttrOutputTypes[] = "_output_types";
+  static constexpr char kAttrOutputShapes[] = "_output_shapes";
+
   LOG(INFO) << "Converting unsupported operation: " << node.op();
   auto* op = new TensorFlowUnsupportedOperator;
   const int num_inputs = GetInputsCount(node, tf_import_flags);
@@ -1055,11 +1060,11 @@ tensorflow::Status ConvertUnsupportedOperator(
   op->tensorflow_op = node.op();
   node.SerializeToString(&op->tensorflow_node_def);
   model->operators.emplace_back(op);
-  if (HasAttr(node, "_output_quantized")) {
-    op->quantized = GetBoolAttr(node, "_output_quantized");
+  if (HasAttr(node, kAttrOutputQuantized)) {
+    op->quantized = GetBoolAttr(node, kAttrOutputQuantized);
   }
-  if (HasAttr(node, "_output_types")) {
-    const auto& output_types = GetListAttr(node, "_output_types");
+  if (HasAttr(node, kAttrOutputTypes)) {
+    const auto& output_types = GetListAttr(node, kAttrOutputTypes);
     for (int i = 0; i < output_types.type_size(); ++i) {
       op->output_data_types.push_back(ConvertDataType(output_types.type(i)));
     }
@@ -1067,6 +1072,19 @@ tensorflow::Status ConvertUnsupportedOperator(
     const auto& output_type = GetDataTypeAttr(node, "Tout");
     op->output_data_types.push_back(ConvertDataType(output_type));
   }
+  if (HasAttr(node, kAttrOutputShapes)) {
+    const auto& output_shapes = GetListAttr(node, kAttrOutputShapes);
+    Shape output_shape;
+    for (int i = 0; i < output_shapes.shape_size(); ++i) {
+      const auto status =
+          ImportShape(output_shapes.shape(i).dim(), /*input_flat_size=*/nullptr,
+                      &output_shape);
+      if (!status.ok()) {
+        return status;
+      }
+      op->output_shapes.push_back(output_shape);
+    }
+  }
   return tensorflow::Status::OK();
 }
 
@@ -1529,6 +1547,7 @@ tensorflow::Status ConvertPackOperator(
   for (int i = 0; i < num_inputs; ++i) {
     op->inputs.push_back(node.input(i));
   }
+  op->values_count = HasAttr(node, "N") ? GetIntAttr(node, "N") : num_inputs;
   op->axis = HasAttr(node, "axis") ? GetIntAttr(node, "axis") : 0;
   op->dtype = ConvertDataType(toco::GetDataTypeAttr(node, "T"));
   op->outputs.push_back(node.name());
@@ -1814,6 +1833,27 @@ tensorflow::Status ConvertSparseToDenseOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertOneHotOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "OneHot");
+  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 4));
+
+  const auto dtype = GetDataTypeAttr(node, "T");
+  // TODO(b/111744875): Support DT_UINT8 and quantization.
+  CHECK(dtype == DT_INT32 || dtype == DT_INT64 || dtype == DT_FLOAT ||
+        dtype == DT_BOOL);
+
+  auto op = absl::make_unique<OneHotOperator>();
+  op->axis = HasAttr(node, "axis") ? GetIntAttr(node, "axis") : -1;
+  for (const string& input : node.input()) {
+    op->inputs.push_back(input);
+  }
+  op->outputs.push_back(node.name());
+  model->operators.emplace_back(op.release());
+  return tensorflow::Status::OK();
+}
+
 }  // namespace
 
 namespace internal {
@@ -1890,6 +1930,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"NextIteration", ConvertOperatorSpecialCasedAsRNNBackEdge},
       {"NoOp", ConvertNoOpOperator},
       {"NotEqual", ConvertSimpleOperator<TensorFlowNotEqualOperator, 2>},
+      {"OneHot", ConvertOneHotOperator},
       {"Pack", ConvertPackOperator},
       {"Pad", ConvertSimpleOperator<PadOperator, 2>},
       {"PadV2", ConvertSimpleOperator<PadV2Operator, 3>},
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 6fe194516d4cc5c6363829ae43140d2fdaa4693b..a3827977fdd76af86b44e37cad8583fe2d589d9c 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -64,6 +64,7 @@ enum class OperatorType : uint8 {
   kMaxPool,
   kFakeQuant,
   kMul,
+  kOneHot,
   kRandomUniform,
   kRange,
   kRank,
@@ -292,6 +293,46 @@ struct Buffer : GenericBuffer {
   std::vector<DataType<A>> data;
 };
 
+class Shape {
+ public:
+  // For Shape, we stick to half-way encapsulation for now:
+  // we hide the raw dims_ member, but expose it raw by accessors
+  // because from some brainstorming, it's not at all easy to
+  // anticipate which flavor of more hermetic encapsulation would
+  // actually buy us future-proof-ness without being needlessly
+  // cumbersome.
+  Shape() {}
+  Shape(std::initializer_list<int> dim_list) : dims_(dim_list) {}
+
+  void ReplaceDims(std::initializer_list<int> dim_list) {
+    dims_ = std::vector<int>(dim_list);
+  }
+
+  const std::vector<int>& dims() const { return dims_; }
+  std::vector<int>* mutable_dims() { return &dims_; }
+  const int dimensions_count() const { return dims_.size(); }
+
+  // We still have that one convenience accessor to avoid
+  // the awkward double bracket issue:  shape.dims()[i].
+  int dims(int i) const {
+    // Always check for out-of-bounds accesses, even in optimized builds where
+    // standard assertions are disabled. Out-of-bounds access here is a common
+    // occurrence.
+    CHECK_GE(i, 0);
+    CHECK_GT(dims_.size(), i);
+    return dims_[i];
+  }
+
+  bool operator==(const Shape& comp) const {
+    return (this->dims_ == comp.dims());
+  }
+
+  bool operator!=(const Shape& comp) const { return !((*this) == comp); }
+
+ private:
+  std::vector<int> dims_;
+};
+
 // Base class for all operator classes.
 struct Operator {
   // Non-default-constructible: only OperatorType-specific subclass
@@ -1164,6 +1205,7 @@ struct TensorFlowRsqrtOperator : Operator {
 // TensorFlow equivalent: Pack
 struct PackOperator : Operator {
   PackOperator() : Operator(OperatorType::kPack) {}
+  int values_count;
   int axis = 0;
   ArrayDataType dtype = ArrayDataType::kNone;
 };
@@ -1468,6 +1510,8 @@ struct TensorFlowUnsupportedOperator : Operator {
   bool quantized = false;
   // Output data types
   std::vector<ArrayDataType> output_data_types;
+  // Output shapes.
+  std::vector<Shape> output_shapes;
 };
 
 // Softmax activation function.
@@ -1725,6 +1769,27 @@ struct LogicalNotOperator : Operator {
   LogicalNotOperator() : Operator(OperatorType::kLogicalNot) {}
 };
 
+// OneHot operator:
+//
+// Inputs:
+// Inputs[0]: required: indices.
+// Inputs[1]: required: depth.
+// Inputs[2]: required: on_value.
+// Inputs[3]: required: off_value.
+//
+// TensorFlow equivalent: OneHot.
+struct OneHotOperator : Operator {
+  enum Inputs {
+    INDICES_INPUT = 0,
+    DEPTH_INPUT = 1,
+    ON_VALUE_INPUT = 2,
+    OFF_VALUE_INPUT = 3,
+  };
+
+  OneHotOperator() : Operator(OperatorType::kOneHot) {}
+  int axis = -1;
+};
+
 // Alloc's are used for transient arrays only. An Alloc specifies which interval
 // of the "transient_data" workspace buffer passed to inference functions, is to
 // be used for the transient array at hand. The 'start' and 'end' values are
@@ -1738,46 +1803,6 @@ inline bool operator<(const Alloc& a, const Alloc& b) {
   return a.start < b.start;
 }
 
-class Shape {
- public:
-  // For Shape, we stick to half-way encapsulation for now:
-  // we hide the raw dims_ member, but expose it raw by accessors
-  // because from some brainstorming, it's not at all easy to
-  // anticipate which flavor of more hermetic encapsulation would
-  // actually buy us future-proof-ness without being needlessly
-  // cumbersome.
-  Shape() {}
-  Shape(std::initializer_list<int> dim_list) : dims_(dim_list) {}
-
-  void ReplaceDims(std::initializer_list<int> dim_list) {
-    dims_ = std::vector<int>(dim_list);
-  }
-
-  const std::vector<int>& dims() const { return dims_; }
-  std::vector<int>* mutable_dims() { return &dims_; }
-  const int dimensions_count() const { return dims_.size(); }
-
-  // We still have that one convenience accessor to avoid
-  // the awkward double bracket issue:  shape.dims()[i].
-  int dims(int i) const {
-    // Always check for out-of-bounds accesses, even in optimized builds where
-    // standard assertions are disabled. Out-of-bounds access here is a common
-    // occurrence.
-    CHECK_GE(i, 0);
-    CHECK_GT(dims_.size(), i);
-    return dims_[i];
-  }
-
-  bool operator==(const Shape& comp) const {
-    return (this->dims_ == comp.dims());
-  }
-
-  bool operator!=(const Shape& comp) const { return !((*this) == comp); }
-
- private:
-  std::vector<int> dims_;
-};
-
 // Array represents an array (either a constant parameter array or an
 // activations array) in a Model.
 struct Array {
diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 06072d1fcb0612ed8193b3a0be1317923fe95bcc..d34da63e43eee3b48e575c33ddb6c89f7701865c 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -322,6 +322,10 @@ void ReadModelFlagsFromCommandLineFlags(
     for (int i = 0; i < input_shapes.size(); ++i) {
       auto* shape = model_flags->mutable_input_arrays(i)->mutable_shape();
       shape->clear_dims();
+      // Treat an empty input shape as a scalar.
+      if (input_shapes[i].empty()) {
+        continue;
+      }
       for (const auto& dim_str : absl::StrSplit(input_shapes[i], ',')) {
         int size;
         CHECK(absl::SimpleAtoi(dim_str, &size))
diff --git a/tensorflow/contrib/lite/toco/tflite/export_test.cc b/tensorflow/contrib/lite/toco/tflite/export_test.cc
index d1fdbcb8e9131e1d65fa32ca0395bbc17b2014e7..a95937ba0f4f66fedfab6c1528c8dc4e417297d0 100644
--- a/tensorflow/contrib/lite/toco/tflite/export_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export_test.cc
@@ -262,7 +262,7 @@ TEST_F(VersionedOpExportTest, Export) {
   EXPECT_EQ(1, (*operators)[1]->opcode_index());
 }
 
-// TODO(ahentz): tests for tensors, inputs, outpus, opcodes and operators.
+// TODO(ahentz): tests for tensors, inputs, outputs, opcodes and operators.
 
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 1a1c4b8944965b667ec283584bb8e34c0cabfb5b..769e350ea9340c934beebd643d03a6dbc4242fa4 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -1013,6 +1013,26 @@ class ExpandDims
   int GetVersion(const Operator& op) const override { return 1; }
 };
 
+class Pack : public BuiltinOperator<PackOperator, ::tflite::PackOptions,
+                                    ::tflite::BuiltinOptions_PackOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreatePackOptions(*builder, op.values_count, op.axis);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->values_count = options.values_count();
+    op->axis = options.axis();
+  }
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
 class Shape
     : public BuiltinOperator<TensorFlowShapeOperator, ::tflite::ShapeOptions,
                              ::tflite::BuiltinOptions_ShapeOptions> {
@@ -1033,6 +1053,23 @@ class Shape
   int GetVersion(const Operator& op) const override { return 1; }
 };
 
+class OneHot : public BuiltinOperator<OneHotOperator, ::tflite::OneHotOptions,
+                                      ::tflite::BuiltinOptions_OneHotOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateOneHotOptions(*builder, op.axis);
+  }
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->axis = options.axis();
+  }
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
 class TensorFlowUnsupported : public BaseOperator {
  public:
   using BaseOperator::BaseOperator;
@@ -1256,6 +1293,10 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       new Shape(::tflite::BuiltinOperator_SHAPE, OperatorType::kShape));
   ops.emplace_back(new FakeQuant(::tflite::BuiltinOperator_FAKE_QUANT,
                                  OperatorType::kFakeQuant));
+  ops.emplace_back(
+      new Pack(::tflite::BuiltinOperator_PACK, OperatorType::kPack));
+  ops.emplace_back(
+      new OneHot(::tflite::BuiltinOperator_ONE_HOT, OperatorType::kOneHot));
 
   // Custom Operators.
   ops.emplace_back(
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index ff2d35b1f5c6613e01e665cb616b85f281ba800e..7e1e32ae54731a0ffe72fbdbfd4cedc72b655c7f 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -452,6 +452,24 @@ TEST_F(OperatorTest, BuiltinSparseToDense) {
   EXPECT_EQ(op.validate_indices, output_toco_op->validate_indices);
 }
 
+TEST_F(OperatorTest, BuiltinPack) {
+  PackOperator op;
+  op.values_count = 3;
+  op.axis = 1;
+  std::unique_ptr<toco::PackOperator> output_toco_op =
+      SerializeAndDeserialize(GetOperator("PACK", OperatorType::kPack), op);
+  EXPECT_EQ(op.values_count, output_toco_op->values_count);
+  EXPECT_EQ(op.axis, output_toco_op->axis);
+}
+
+TEST_F(OperatorTest, BuiltinOneHot) {
+  OneHotOperator op;
+  op.axis = 2;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("ONE_HOT", OperatorType::kOneHot), op);
+  EXPECT_EQ(op.axis, output_toco_op->axis);
+}
+
 TEST_F(OperatorTest, TensorFlowUnsupported) {
   TensorFlowUnsupportedOperator op;
   op.tensorflow_op = "MyCustomUnsupportedOp";
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index aa7f6996eb8a166184cac04ff818c66fcdc34703..fcd3cbab07c06737f43d822e5b16f7c188f56b1a 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -309,8 +309,9 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
     // HardcodeMinMax to move changes through the graph as we make changes.
     auto propagate_default_min_max =
         absl::make_unique<PropagateDefaultMinMax>();
-    if (toco_flags.has_default_ranges_min() &&
-        toco_flags.has_default_ranges_max()) {
+    bool has_default_ranges_flag = (toco_flags.has_default_ranges_min() &&
+                                    toco_flags.has_default_ranges_max());
+    if (has_default_ranges_flag) {
       propagate_default_min_max->DefineTypeRange(
           ArrayDataType::kUint8, toco_flags.default_ranges_min(),
           toco_flags.default_ranges_max());
@@ -335,6 +336,8 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
         new EnsureUint8WeightsSafeForFastInt8Kernels;
     ensure_safe_for_int8_kernels->set_allow_nudging_weights(
         toco_flags.allow_nudging_weights_to_use_fast_gemm_kernel());
+    ensure_safe_for_int8_kernels->set_has_default_ranges_flag(
+        has_default_ranges_flag);
     RunGraphTransformations(model, "quantization graph transformations",
                             {
                                 new RemoveTrivialQuantizedActivationFunc,
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 52f8df45a21a4b1606496c76a0b535c7900b611d..93c30dd0f857c0666de78e39edf8bec96bc4a1f1 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -356,6 +356,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(ReduceMin)  //  Reduction Min
     HANDLE_OPERATORTYPENAME_CASE(Minimum)    //  Element-wise Minimum
     HANDLE_OPERATORTYPENAME_CASE(Neg)
+    HANDLE_OPERATORTYPENAME_CASE(OneHot)
     HANDLE_OPERATORTYPENAME_CASE(Pack)
     HANDLE_OPERATORTYPENAME_CASE(Pad)
     HANDLE_OPERATORTYPENAME_CASE(PadV2)
@@ -1585,11 +1586,6 @@ void ResolveModelFlags(const ModelFlags& model_flags, Model* model) {
                                model);
   }
 
-  for (const auto& input_array : model->flags.input_arrays()) {
-    if (input_array.has_shape()) {
-      CHECK(input_array.shape().dims_size());
-    }
-  }
   model->flags.set_change_concat_input_ranges(
       model_flags.change_concat_input_ranges());
   model->flags.set_allow_nonascii_arrays(model_flags.allow_nonascii_arrays());
diff --git a/tensorflow/contrib/lite/toco/tooling_util_test.cc b/tensorflow/contrib/lite/toco/tooling_util_test.cc
index 8609e5beddd200be4e5ebfe1fb2a79048e0e60ab..eb495646a2df0d0295eab54fcc5a5bf156a59d39 100644
--- a/tensorflow/contrib/lite/toco/tooling_util_test.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util_test.cc
@@ -39,6 +39,8 @@ std::vector<ShapePair> CreateShapePairs() {
        {Shape({256, 256, 3}), Shape({256, 256, 3}), Agreement::kBroadcast},
        {Shape({256, 256, 3}), Shape({3}), Agreement::kBroadcast},
        {Shape({8, 1, 6, 1}), Shape({7, 1, 5}), Agreement::kBroadcast},
+       {Shape({}), Shape({3}), Agreement::kBroadcast},
+       {Shape({}), Shape({3, 1}), Agreement::kBroadcast},
 
        // These extend (and therefore broadcast).
        {Shape({3}), Shape({3}), Agreement::kExtend},
@@ -54,6 +56,7 @@ std::vector<ShapePair> CreateShapePairs() {
        {Shape({15, 3, 5}), Shape({15, 1, 5}), Agreement::kBroadcastNotExtend},
        {Shape({15, 3, 5}), Shape({3, 5}), Agreement::kBroadcastNotExtend},
        {Shape({15, 3, 5}), Shape({3, 1}), Agreement::kBroadcastNotExtend},
+       {Shape({3, 1}), Shape({}), Agreement::kBroadcastNotExtend},
 
        // These do not broadcast (and therefore also do not extend).
        {Shape({3}), Shape({4}), Agreement::kNeither},
@@ -175,6 +178,20 @@ TEST(NumElementsTest, UnsignedInt64) {
   EXPECT_EQ(status.error_message(), kLargeTensorMessage);
 }
 
+TEST(NumElementsTest, Scalar) {
+  tensorflow::Status status = tensorflow::Status::OK();
+
+  int32_t count;
+  status = NumElements(std::vector<int32_t>{}, &count);
+  EXPECT_TRUE(status.ok());
+  EXPECT_EQ(count, 1);
+
+  uint64_t countu64;
+  status = NumElements(std::vector<uint64_t>{}, &countu64);
+  EXPECT_TRUE(status.ok());
+  EXPECT_EQ(countu64, 1ULL);
+}
+
 TEST(FusedActivationTest, DefaultsToUnfused) {
   EXPECT_TRUE(OperatorSupportsFusedActivation(OperatorType::kAdd));
   EXPECT_FALSE(OperatorSupportsFusedActivation(OperatorType::kNone));
diff --git a/tensorflow/contrib/lite/tools/benchmark/BUILD b/tensorflow/contrib/lite/tools/benchmark/BUILD
index 810e25961f844b71af2d37ff4d45006a6c74054d..2cb07eb6ec9405a5fefec9cc49f3b1aaff663e4b 100644
--- a/tensorflow/contrib/lite/tools/benchmark/BUILD
+++ b/tensorflow/contrib/lite/tools/benchmark/BUILD
@@ -10,11 +10,16 @@ load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
 
 common_copts = ["-Wall"] + tflite_copts()
 
+cc_library(
+    name = "logging",
+    hdrs = ["logging.h"],
+    copts = common_copts,
+)
+
 cc_binary(
     name = "benchmark_model",
     srcs = [
         "benchmark_main.cc",
-        "logging.h",
     ],
     copts = common_copts,
     linkopts = tflite_linkopts() + select({
@@ -26,6 +31,26 @@ cc_binary(
     }),
     deps = [
         ":benchmark_tflite_model_lib",
+        ":logging",
+    ],
+)
+
+cc_test(
+    name = "benchmark_test",
+    srcs = ["benchmark_test.cc"],
+    args = [
+        "--graph=$(location //tensorflow/contrib/lite:testdata/multi_add.bin)",
+    ],
+    data = ["//tensorflow/contrib/lite:testdata/multi_add.bin"],
+    tags = [
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":benchmark_tflite_model_lib",
+        ":command_line_flags",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
     ],
 )
 
@@ -40,7 +65,6 @@ cc_test(
     name = "command_line_flags_test",
     srcs = ["command_line_flags_test.cc"],
     copts = common_copts,
-    tags = ["no_oss"],
     visibility = ["//visibility:private"],
     deps = [
         ":command_line_flags",
@@ -59,6 +83,7 @@ cc_library(
     copts = common_copts,
     deps = [
         ":benchmark_model_lib",
+        ":logging",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/kernels:builtin_ops",
@@ -71,23 +96,23 @@ cc_library(
     name = "benchmark_params",
     srcs = [
         "benchmark_params.cc",
-        "logging.h",
     ],
     hdrs = ["benchmark_params.h"],
     copts = common_copts,
+    deps = [":logging"],
 )
 
 cc_library(
     name = "benchmark_model_lib",
     srcs = [
         "benchmark_model.cc",
-        "logging.h",
     ],
     hdrs = ["benchmark_model.h"],
     copts = common_copts,
     deps = [
         ":benchmark_params",
         ":command_line_flags",
+        ":logging",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/kernels:builtin_ops",
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_model.cc b/tensorflow/contrib/lite/tools/benchmark/benchmark_model.cc
index 19b9a9c7ba40ad8241632dda77db77a5e1ce8e63..f86c0445b0525cd053c733b18bb7f1205d310d43 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_model.cc
@@ -84,7 +84,7 @@ std::vector<Flag> BenchmarkModel::GetFlags() {
   };
 }
 
-void BenchmarkModel::LogFlags() {
+void BenchmarkModel::LogParams() {
   TFLITE_LOG(INFO) << "Num runs: [" << params_.Get<int32_t>("num_runs") << "]";
   TFLITE_LOG(INFO) << "Inter-run delay (seconds): ["
                    << params_.Get<float>("run_delay") << "]";
@@ -122,12 +122,18 @@ Stat<int64_t> BenchmarkModel::Run(int num_times, RunType run_type) {
   return run_stats;
 }
 
+bool BenchmarkModel::ValidateParams() { return true; }
+
 void BenchmarkModel::Run(int argc, char **argv) {
   if (!ParseFlags(argc, argv)) {
     return;
   }
+  Run();
+}
 
-  LogFlags();
+void BenchmarkModel::Run() {
+  ValidateParams();
+  LogParams();
 
   listeners_.OnBenchmarkStart(params_);
   int64_t initialization_start_us = profiling::time::NowMicros();
@@ -155,7 +161,7 @@ bool BenchmarkModel::ParseFlags(int argc, char **argv) {
     TFLITE_LOG(ERROR) << usage;
     return false;
   }
-  return ValidateFlags();
+  return true;
 }
 
 }  // namespace benchmark
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_model.h b/tensorflow/contrib/lite/tools/benchmark/benchmark_model.h
index 3c7063b2d49b4d0de22a74490aea0d62383da6a8..677a1ee68c247fb016c7ede4e1a614bacb7a0a15 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_model.h
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_model.h
@@ -137,16 +137,17 @@ class BenchmarkModel {
   BenchmarkModel();
   BenchmarkModel(BenchmarkParams params) : params_(std::move(params)) {}
   virtual ~BenchmarkModel() {}
-  bool ParseFlags(int argc, char** argv);
   virtual void Init() = 0;
   void Run(int argc, char** argv);
+  virtual void Run();
   void AddListener(BenchmarkListener* listener) {
     listeners_.AddListener(listener);
   }
 
  protected:
-  virtual void LogFlags();
-  virtual bool ValidateFlags() { return true; }
+  virtual void LogParams();
+  virtual bool ValidateParams();
+  bool ParseFlags(int argc, char** argv);
   virtual std::vector<Flag> GetFlags();
   virtual uint64_t ComputeInputBytes() = 0;
   virtual tensorflow::Stat<int64_t> Run(int num_times, RunType run_type);
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_test.cc b/tensorflow/contrib/lite/tools/benchmark/benchmark_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b697bb394db9b967dfaaff649517dcc23e85ccb0
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_test.cc
@@ -0,0 +1,74 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
+#include "tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h"
+#include "tensorflow/contrib/lite/tools/benchmark/command_line_flags.h"
+
+namespace {
+const std::string* g_model_path = nullptr;
+}
+
+namespace tflite {
+namespace benchmark {
+namespace {
+
+BenchmarkParams CreateParams() {
+  BenchmarkParams params;
+  params.AddParam("num_runs", BenchmarkParam::Create<int32_t>(2));
+  params.AddParam("run_delay", BenchmarkParam::Create<float>(-1.0f));
+  params.AddParam("num_threads", BenchmarkParam::Create<int32_t>(1));
+  params.AddParam("benchmark_name", BenchmarkParam::Create<std::string>(""));
+  params.AddParam("output_prefix", BenchmarkParam::Create<std::string>(""));
+  params.AddParam("warmup_runs", BenchmarkParam::Create<int32_t>(1));
+  params.AddParam("graph", BenchmarkParam::Create<std::string>(*g_model_path));
+  params.AddParam("input_layer", BenchmarkParam::Create<std::string>(""));
+  params.AddParam("input_layer_shape", BenchmarkParam::Create<std::string>(""));
+  params.AddParam("use_nnapi", BenchmarkParam::Create<bool>(false));
+  return params;
+}
+
+TEST(BenchmarkTest, DoesntCrash) {
+  ASSERT_THAT(g_model_path, testing::NotNull());
+
+  BenchmarkTfLiteModel benchmark(CreateParams());
+  benchmark.Run();
+}
+
+}  // namespace
+}  // namespace benchmark
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  std::string model_path;
+  std::vector<tflite::Flag> flags = {
+      tflite::Flag::CreateFlag("graph", &model_path, "Path to model file.")};
+  g_model_path = &model_path;
+  const bool parse_result =
+      tflite::Flags::Parse(&argc, const_cast<const char**>(argv), flags);
+  if (!parse_result) {
+    std::cerr << tflite::Flags::Usage(argv[0], flags);
+    return 1;
+  }
+
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
index 73affc26b034f415ae2a2101e0b558cdb94d8d5b..7f97f5d0cd6c412653f6d510406daf86b7baa3f7 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -198,8 +198,8 @@ std::vector<Flag> BenchmarkTfLiteModel::GetFlags() {
   return flags;
 }
 
-void BenchmarkTfLiteModel::LogFlags() {
-  BenchmarkModel::LogFlags();
+void BenchmarkTfLiteModel::LogParams() {
+  BenchmarkModel::LogParams();
   TFLITE_LOG(INFO) << "Graph: [" << params_.Get<std::string>("graph") << "]";
   TFLITE_LOG(INFO) << "Input layers: ["
                    << params_.Get<std::string>("input_layer") << "]";
@@ -208,7 +208,7 @@ void BenchmarkTfLiteModel::LogFlags() {
   TFLITE_LOG(INFO) << "Use nnapi : [" << params_.Get<bool>("use_nnapi") << "]";
 }
 
-bool BenchmarkTfLiteModel::ValidateFlags() {
+bool BenchmarkTfLiteModel::ValidateParams() {
   if (params_.Get<std::string>("graph").empty()) {
     TFLITE_LOG(ERROR)
         << "Please specify the name of your TF Lite input file with --graph";
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
index 50cc3f24b3bd2f31555eac69ff208fa2480449b9..9931dcbafe06cb9f8673462858244f6f2793b29d 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
@@ -54,8 +54,8 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
   BenchmarkTfLiteModel(BenchmarkParams params);
 
   std::vector<Flag> GetFlags() override;
-  void LogFlags() override;
-  bool ValidateFlags() override;
+  void LogParams() override;
+  bool ValidateParams() override;
   uint64_t ComputeInputBytes() override;
   void Init() override;
   void RunImpl() override;
diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index 889accdd5aafae2931048ffdd26408cccb3c874e..8d510ede5827df3889307c0f38572bece84f102e 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -280,6 +280,21 @@ class HashTableOpTest(test.TestCase):
       table.init.run()
       self.assertAllEqual(3, table.size().eval())
 
+  def testHashTableInt32String(self):
+    with self.test_session():
+      default_val = "n/a"
+      keys = constant_op.constant([0, 1, 2], dtypes.int32)
+      values = constant_op.constant(["brain", "salad", "surgery"])
+      table = lookup.HashTable(
+          lookup.KeyValueTensorInitializer(keys, values), default_val)
+      table.init.run()
+
+      input_tensor = constant_op.constant([0, 1, -1])
+      output = table.lookup(input_tensor)
+
+      result = output.eval()
+      self.assertAllEqual([b"brain", b"salad", b"n/a"], result)
+
 
 class MutableHashTableOpTest(test.TestCase):
 
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index b14202ff9ec38016f926ee37c8acbd2bbb4c6ef5..a328670526089988c181a8e1146c911309640009 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -3715,6 +3715,7 @@ def count(values,
           name=None):
   """Computes the number of examples, or sum of `weights`.
 
+  This metric keeps track of the denominator in `tf.metrics.mean`.
   When evaluating some metric (e.g. mean) on one or more subsets of the data,
   this auxiliary metric is useful for keeping track of how many examples there
   are in each subset.
@@ -3741,15 +3742,21 @@ def count(values,
     ValueError: If `weights` is not `None` and its shape doesn't match `values`,
       or if either `metrics_collections` or `updates_collections` are not a list
       or tuple.
+    RuntimeError: If eager execution is enabled.
   """
+  if context.executing_eagerly():
+    raise RuntimeError('tf.contrib.metrics.count is not supported when eager '
+                       'execution is enabled.')
 
   with variable_scope.variable_scope(name, 'count', (values, weights)):
+
     count_ = metrics_impl.metric_variable([], dtypes.float32, name='count')
 
     if weights is None:
       num_values = math_ops.to_float(array_ops.size(values))
     else:
-      _, _, weights = metrics_impl._remove_squeezable_dimensions(  # pylint: disable=protected-access
+      values = math_ops.to_float(values)
+      values, _, weights = metrics_impl._remove_squeezable_dimensions(  # pylint: disable=protected-access
           predictions=values,
           labels=None,
           weights=weights)
@@ -3758,15 +3765,14 @@ def count(values,
       num_values = math_ops.reduce_sum(weights)
 
     with ops.control_dependencies([values]):
-      update_op = state_ops.assign_add(count_, num_values)
+      update_count_op = state_ops.assign_add(count_, num_values)
 
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, count_)
+    count_ = metrics_impl._aggregate_variable(count_, metrics_collections)  # pylint: disable=protected-access
 
     if updates_collections:
-      ops.add_to_collections(updates_collections, update_op)
+      ops.add_to_collections(updates_collections, update_count_op)
 
-    return count_, update_op
+    return count_, update_count_op
 
 
 def cohen_kappa(labels,
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index a09fc4abd461323d67e914c70932688816fed764..401fedcbed8fef12308d563d108725a418dfef17 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -6854,6 +6854,11 @@ class CountTest(test.TestCase):
         array_ops.ones([4, 3]), updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  def testReturnType(self):
+    c, op = metrics.count(array_ops.ones([4, 3]))
+    self.assertTrue(isinstance(c, ops.Tensor))
+    self.assertTrue(isinstance(op, ops.Operation) or isinstance(op, ops.Tensor))
+
   def testBasic(self):
     with self.test_session() as sess:
       values_queue = data_flow_ops.FIFOQueue(
diff --git a/tensorflow/contrib/model_pruning/README.md b/tensorflow/contrib/model_pruning/README.md
index 86f4fd6adf60d8fa54c13989bf4087e28f1e006f..9143d082bf08fefa7aa522455eb3af911e636ae0 100644
--- a/tensorflow/contrib/model_pruning/README.md
+++ b/tensorflow/contrib/model_pruning/README.md
@@ -66,10 +66,10 @@ is the sparsity_function_begin_step. In this equation, the
 sparsity_function_exponent is set to 3.
 ### Adding pruning ops to the training graph
 
-The final step involves adding ops to the training graph that monitors the
-distribution of the layer's weight magnitudes and determines the layer threshold
-such masking all the weights below this threshold achieves the sparsity level
-desired for the current training step. This can be achieved as follows:
+The final step involves adding ops to the training graph that monitor the
+distribution of the layer's weight magnitudes and determine the layer threshold,
+such that masking all the weights below this threshold achieves the sparsity
+level desired for the current training step. This can be achieved as follows:
 
 ```python
 tf.app.flags.DEFINE_string(
@@ -79,7 +79,7 @@ tf.app.flags.DEFINE_string(
 with tf.graph.as_default():
 
   # Create global step variable
-  global_step = tf.train.get_global_step()
+  global_step = tf.train.get_or_create_global_step()
 
   # Parse pruning hyperparameters
   pruning_hparams = pruning.get_pruning_hparams().parse(FLAGS.pruning_hparams)
@@ -103,6 +103,7 @@ with tf.graph.as_default():
     mon_sess.run(mask_update_op)
 
 ```
+Ensure that `global_step` is being [incremented](https://www.tensorflow.org/api_docs/python/tf/train/Optimizer#minimize), otherwise pruning will not work!
 
 ## Example: Pruning and training deep CNNs on the cifar10 dataset
 
diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py
index 4b7af18b3316950afdb90c344ce777848c63e4c1..da9d398cbc06299a33ab400cc9b4d780531211db 100644
--- a/tensorflow/contrib/model_pruning/python/pruning.py
+++ b/tensorflow/contrib/model_pruning/python/pruning.py
@@ -518,11 +518,11 @@ class Pruning(object):
       summary.scalar('last_mask_update_step', self._last_update_step)
       masks = get_masks()
       thresholds = get_thresholds()
-      for index, mask in enumerate(masks):
+      for mask, threshold in zip(masks, thresholds):
         if not self._exists_in_do_not_prune_list(mask.name):
-          summary.scalar(mask.name + '/sparsity', nn_impl.zero_fraction(mask))
-          summary.scalar(thresholds[index].op.name + '/threshold',
-                         thresholds[index])
+          summary.scalar(mask.op.name + '/sparsity', 
+                         nn_impl.zero_fraction(mask))
+          summary.scalar(threshold.op.name + '/threshold', threshold)
 
   def print_hparams(self):
     logging.info(self._spec.to_json())
diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD
index 5225ecc14fef3cec9506eceb776805b74a87714e..3ba3ee29ec79687df522eb330665a2ce80061682 100644
--- a/tensorflow/contrib/optimizer_v2/BUILD
+++ b/tensorflow/contrib/optimizer_v2/BUILD
@@ -193,6 +193,7 @@ cuda_py_test(
     srcs = ["rmsprop_test.py"],
     additional_deps = [
         ":training",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework",
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py b/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
index ec033c4a0163ba9ed39e55fa9e92dfdadc9a1b2f..a44bfd1bfd97e678fbf4c402ef5b1298dc518f75 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2_test.py
@@ -38,12 +38,8 @@ class OptimizerTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testBasic(self):
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
-      # Note that we name the variables uniquely here since the variables don't
-      # seem to be getting deleted at the end of the loop.
-      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
-                                                    name='a_%d' % i)
-      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
-                                                    name='b_%d' % i)
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
       def loss():
         return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
       # Note that for eager execution, minimize expects a function instead of a
@@ -131,12 +127,8 @@ class OptimizerTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testNoGradients(self):
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
-      # Note that we name the variables uniquely here since the variables don't
-      # seem to be getting deleted at the end of the loop.
-      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
-                                                    name='a%d' % i)
-      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
-                                                    name='b%d' % i)
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
       # pylint: disable=cell-var-from-loop
       def loss():
         return 5 * var0
@@ -149,12 +141,8 @@ class OptimizerTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testNoGradientsForAnyVariables_Minimize(self):
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
-      # Note that we name the variables uniquely here since the variables don't
-      # seem to be getting deleted at the end of the loop.
-      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
-                                                    name='a_%d' % i)
-      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
-                                                    name='b_%d' % i)
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
       def loss():
         return constant_op.constant(5.0)
       sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
@@ -165,12 +153,8 @@ class OptimizerTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testNoGradientsForAnyVariables_ApplyGradients(self):
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
-      # Note that we name the variables uniquely here since the variables don't
-      # seem to be getting deleted at the end of the loop.
-      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
-                                                    name='a_%d' % i)
-      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
-                                                    name='b_%d' % i)
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
       sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
       with self.assertRaisesRegexp(ValueError,
                                    'No gradients provided for any variable'):
@@ -179,12 +163,8 @@ class OptimizerTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testGradientsAsVariables(self):
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
-      # Note that we name the variables uniquely here since the variables don't
-      # seem to be getting deleted at the end of the loop.
-      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype,
-                                                    name='a%d' % i)
-      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype,
-                                                    name='b%d' % i)
+      var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+      var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
       def loss():
         return 5 * var0 + 3 * var1  # pylint: disable=cell-var-from-loop
       sgd_op = gradient_descent.GradientDescentOptimizer(3.0)
diff --git a/tensorflow/contrib/optimizer_v2/rmsprop_test.py b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
index ed68f6afbf8bf9678649c1ce6fc59c3b91026dc0..dc23ef241a43900ed40f029f1b857820459e43d0 100644
--- a/tensorflow/contrib/optimizer_v2/rmsprop_test.py
+++ b/tensorflow/contrib/optimizer_v2/rmsprop_test.py
@@ -19,15 +19,16 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
-import itertools
 import math
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.contrib.optimizer_v2 import rmsprop
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -48,13 +49,8 @@ _TEST_PARAM_VALUES = [
     [0.5, 0.95, 0.9, 1e-5, True, False],
 ]
 
-_TESTPARAMS = [
-    [data_type] + values
-    for data_type, values in itertools.product(_DATA_TYPES, _TEST_PARAM_VALUES)
-]
-
 
-class RMSPropOptimizerTest(test.TestCase):
+class RMSPropOptimizerTest(test.TestCase, parameterized.TestCase):
 
   def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, decay, momentum,
                             epsilon, centered):
@@ -87,362 +83,366 @@ class RMSPropOptimizerTest(test.TestCase):
       var_t[gindex] = var[gindex] - mom_t[gindex]
     return var_t, mg_t, rms_t, mom_t
 
-  def testDense(self):
-    # TODO(yori): Use ParameterizedTest when available
-    for (dtype, learning_rate, decay, momentum,
-         epsilon, centered, use_resource) in _TESTPARAMS:
-      with self.test_session(use_gpu=True):
-        # Initialize variables for numpy implementation.
-        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
-        grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype)
-        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
-        grads1_np = np.array([0.01, 0.2], dtype=dtype.as_numpy_dtype)
-
-        if use_resource:
-          var0 = resource_variable_ops.ResourceVariable(var0_np)
-          var1 = resource_variable_ops.ResourceVariable(var1_np)
-        else:
-          var0 = variables.Variable(var0_np)
-          var1 = variables.Variable(var1_np)
-        grads0 = constant_op.constant(grads0_np)
-        grads1 = constant_op.constant(grads1_np)
-        opt = rmsprop.RMSPropOptimizer(
-            learning_rate=learning_rate,
-            decay=decay,
-            momentum=momentum,
-            epsilon=epsilon,
-            centered=centered)
-
-        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-
-        mg0 = opt.get_slot(var0, "mg")
-        self.assertEqual(mg0 is not None, centered)
-        mg1 = opt.get_slot(var1, "mg")
-        self.assertEqual(mg1 is not None, centered)
-        rms0 = opt.get_slot(var0, "rms")
-        self.assertTrue(rms0 is not None)
-        rms1 = opt.get_slot(var1, "rms")
-        self.assertTrue(rms1 is not None)
-        mom0 = opt.get_slot(var0, "momentum")
-        self.assertTrue(mom0 is not None)
-        mom1 = opt.get_slot(var1, "momentum")
-        self.assertTrue(mom1 is not None)
-
-        mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-        mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-        rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
-        rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
-        mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-        mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-
-        # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
-
-        # Run 4 steps of RMSProp
-        for _ in range(1, 5):
-          update.run()
-
-          var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
-              var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate,
-              decay, momentum, epsilon, centered)
-          var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
-              var1_np, grads1_np, mg1_np, rms1_np, mom1_np, learning_rate,
-              decay, momentum, epsilon, centered)
-
-          # Validate updated params
-          if centered:
-            self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
-            self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
-          self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
-          self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
-          self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
-          self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
-
-  def testMinimizeSparseResourceVariable(self):
-    for dtype in [dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
-        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
-        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        loss = pred * pred
-        sgd_op = rmsprop.RMSPropOptimizer(
-            learning_rate=1.0,
-            decay=0.0,
-            momentum=0.0,
-            epsilon=0.0,
-            centered=False).minimize(loss)
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
-        # Run 1 step of sgd
-        sgd_op.run()
-        # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[0., 1.]], var0.eval(), atol=0.01)
-
-  def testMinimizeSparseResourceVariableCentered(self):
-    for dtype in [dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
-        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
-        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
-        loss = pred * pred
-        sgd_op = rmsprop.RMSPropOptimizer(
-            learning_rate=1.0,
-            decay=0.0,
-            momentum=0.0,
-            epsilon=1.0,
-            centered=True).minimize(loss)
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
-        # Run 1 step of sgd
-        sgd_op.run()
-        # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[-111, -138]], var0.eval(), atol=0.01)
-
-  def testSparse(self):
-    # TODO(yori): Use ParameterizedTest when available
-    for (dtype, learning_rate, decay,
-         momentum, epsilon, centered, _) in _TESTPARAMS:
-      with self.test_session(use_gpu=True):
-        # Initialize variables for numpy implementation.
-        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
-        grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)
-        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
-        grads1_np = np.array([0.01], dtype=dtype.as_numpy_dtype)
-
+  @parameterized.named_parameters(
+      *test_util.generate_combinations_with_testcase_name(
+          dtype=_DATA_TYPES, param_value=_TEST_PARAM_VALUES))
+  def testDense(self, dtype, param_value):
+    (learning_rate, decay, momentum, epsilon, centered, use_resource) = tuple(
+        param_value)
+    with self.test_session(use_gpu=True):
+      # Initialize variables for numpy implementation.
+      var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+      grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype)
+      var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+      grads1_np = np.array([0.01, 0.2], dtype=dtype.as_numpy_dtype)
+
+      if use_resource:
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+      else:
         var0 = variables.Variable(var0_np)
         var1 = variables.Variable(var1_np)
-        grads0_np_indices = np.array([0], dtype=np.int32)
-        grads0 = ops.IndexedSlices(
-            constant_op.constant(grads0_np),
-            constant_op.constant(grads0_np_indices), constant_op.constant([1]))
-        grads1_np_indices = np.array([1], dtype=np.int32)
-        grads1 = ops.IndexedSlices(
-            constant_op.constant(grads1_np),
-            constant_op.constant(grads1_np_indices), constant_op.constant([1]))
-        opt = rmsprop.RMSPropOptimizer(
-            learning_rate=learning_rate,
-            decay=decay,
-            momentum=momentum,
-            epsilon=epsilon,
-            centered=centered)
-        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-
-        mg0 = opt.get_slot(var0, "mg")
-        self.assertEqual(mg0 is not None, centered)
-        mg1 = opt.get_slot(var1, "mg")
-        self.assertEqual(mg1 is not None, centered)
-        rms0 = opt.get_slot(var0, "rms")
-        self.assertTrue(rms0 is not None)
-        rms1 = opt.get_slot(var1, "rms")
-        self.assertTrue(rms1 is not None)
-        mom0 = opt.get_slot(var0, "momentum")
-        self.assertTrue(mom0 is not None)
-        mom1 = opt.get_slot(var1, "momentum")
-        self.assertTrue(mom1 is not None)
-
-        mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-        mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-        rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
-        rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
-        mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-        mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
-
-        # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
-
-        # Run 4 steps of RMSProp
-        for _ in range(1, 5):
-          update.run()
-
-          var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
-              var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
-              learning_rate, decay, momentum, epsilon, centered)
-          var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy(
-              var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np, mom1_np,
-              learning_rate, decay, momentum, epsilon, centered)
-
-          # Validate updated params
-          if centered:
-            self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
-            self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
-          self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
-          self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
-          self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
-          self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
-
-  def testWithoutMomentum(self):
-    for dtype in [dtypes.half, dtypes.float32]:
-      with self.test_session(use_gpu=True):
-        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        opt = rmsprop.RMSPropOptimizer(
-            learning_rate=2.0, decay=0.9, momentum=0.0, epsilon=1.0)
-        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-
-        rms0 = opt.get_slot(var0, "rms")
-        self.assertTrue(rms0 is not None)
-        rms1 = opt.get_slot(var1, "rms")
-        self.assertTrue(rms1 is not None)
-        mom0 = opt.get_slot(var0, "momentum")
-        self.assertTrue(mom0 is not None)
-        mom1 = opt.get_slot(var1, "momentum")
-        self.assertTrue(mom1 is not None)
-
-        # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
-        # Step 1: the rms accumulators where 1. So we should see a normal
-        # update: v -= grad * learning_rate
-        update.run()
-        # Check the root mean square accumulators.
-        self.assertAllCloseAccordingToType(
-            np.array([0.901, 0.901]), rms0.eval())
-        self.assertAllCloseAccordingToType(
-            np.array([0.90001, 0.90001]), rms1.eval())
-        # Check the parameters.
-        self.assertAllCloseAccordingToType(
-            np.array([
-                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)),
-                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0))
-            ]), var0.eval())
-        self.assertAllCloseAccordingToType(
-            np.array([
-                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)),
-                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0))
-            ]), var1.eval())
-        # Step 2: the root mean square accumulators contain the previous update.
-        update.run()
-        # Check the rms accumulators.
-        self.assertAllCloseAccordingToType(
-            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
-        self.assertAllCloseAccordingToType(
-            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
-        # Check the parameters.
-        self.assertAllCloseAccordingToType(
-            np.array([
-                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
-                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0)),
-                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
-                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0))
-            ]), var0.eval())
-        self.assertAllCloseAccordingToType(
-            np.array([
-                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
-                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0)),
-                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
-                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0))
-            ]), var1.eval())
-
-  def testWithMomentum(self):
-    for dtype in [dtypes.half, dtypes.float32]:
-      with self.test_session(use_gpu=True):
-        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
-        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-
-        opt = rmsprop.RMSPropOptimizer(
-            learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1e-5)
-        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-
-        rms0 = opt.get_slot(var0, "rms")
-        self.assertTrue(rms0 is not None)
-        rms1 = opt.get_slot(var1, "rms")
-        self.assertTrue(rms1 is not None)
-        mom0 = opt.get_slot(var0, "momentum")
-        self.assertTrue(mom0 is not None)
-        mom1 = opt.get_slot(var1, "momentum")
-        self.assertTrue(mom1 is not None)
-
-        # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
-        # Step 1: rms = 1, mom = 0. So we should see a normal
-        # update: v -= grad * learning_rate
+      grads0 = constant_op.constant(grads0_np)
+      grads1 = constant_op.constant(grads1_np)
+      opt = rmsprop.RMSPropOptimizer(
+          learning_rate=learning_rate,
+          decay=decay,
+          momentum=momentum,
+          epsilon=epsilon,
+          centered=centered)
+
+      update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      variables.global_variables_initializer().run()
+
+      mg0 = opt.get_slot(var0, "mg")
+      self.assertEqual(mg0 is not None, centered)
+      mg1 = opt.get_slot(var1, "mg")
+      self.assertEqual(mg1 is not None, centered)
+      rms0 = opt.get_slot(var0, "rms")
+      self.assertIsNotNone(rms0)
+      rms1 = opt.get_slot(var1, "rms")
+      self.assertIsNotNone(rms1)
+      mom0 = opt.get_slot(var0, "momentum")
+      self.assertIsNotNone(mom0)
+      mom1 = opt.get_slot(var1, "momentum")
+      self.assertIsNotNone(mom1)
+
+      mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+      mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+      rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+      rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+      mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+      mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], var0.eval())
+      self.assertAllClose([3.0, 4.0], var1.eval())
+
+      # Run 4 steps of RMSProp
+      for _ in range(4):
         update.run()
-        # Check the root mean square accumulators.
-        self.assertAllCloseAccordingToType(
-            np.array([0.901, 0.901]), rms0.eval())
-        self.assertAllCloseAccordingToType(
-            np.array([0.90001, 0.90001]), rms1.eval())
-        # Check the momentum accumulators
-        self.assertAllCloseAccordingToType(
-            np.array([(0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
-                      (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]), mom0.eval())
-        self.assertAllCloseAccordingToType(
-            np.array([(0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
-                      (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]), mom1.eval())
-
-        # Check that the parameters.
-        self.assertAllCloseAccordingToType(
-            np.array([
-                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
-                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))
-            ]), var0.eval())
-        self.assertAllCloseAccordingToType(
-            np.array([
-                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
-                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))
-            ]), var1.eval())
-
-        # Step 2: the root mean square accumulators contain the previous update.
+
+        var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
+            var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate,
+            decay, momentum, epsilon, centered)
+        var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
+            var1_np, grads1_np, mg1_np, rms1_np, mom1_np, learning_rate,
+            decay, momentum, epsilon, centered)
+
+        # Validate updated params
+        if centered:
+          self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
+          self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
+        self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
+        self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
+        self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
+        self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
+        self.assertAllCloseAccordingToType(var0_np, var0.eval())
+        self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  @parameterized.parameters([dtypes.float32, dtypes.float64])
+  def testMinimizeSparseResourceVariable(self, dtype):
+    with self.test_session():
+      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+      x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+      pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+      loss = pred * pred
+      sgd_op = rmsprop.RMSPropOptimizer(
+          learning_rate=1.0,
+          decay=0.0,
+          momentum=0.0,
+          epsilon=0.0,
+          centered=False).minimize(loss)
+      variables.global_variables_initializer().run()
+      # Fetch params to validate initial values
+      self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+      # Run 1 step of sgd
+      sgd_op.run()
+      # Validate updated params
+      self.assertAllCloseAccordingToType(
+          [[0., 1.]], var0.eval(), atol=0.01)
+
+  @parameterized.parameters([dtypes.float32, dtypes.float64])
+  def testMinimizeSparseResourceVariableCentered(self, dtype):
+    with self.test_session():
+      var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+      x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+      pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+      loss = pred * pred
+      sgd_op = rmsprop.RMSPropOptimizer(
+          learning_rate=1.0,
+          decay=0.0,
+          momentum=0.0,
+          epsilon=1.0,
+          centered=True).minimize(loss)
+      variables.global_variables_initializer().run()
+      # Fetch params to validate initial values
+      self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+      # Run 1 step of sgd
+      sgd_op.run()
+      # Validate updated params
+      self.assertAllCloseAccordingToType(
+          [[-111, -138]], var0.eval(), atol=0.01)
+
+  @parameterized.named_parameters(
+      *test_util.generate_combinations_with_testcase_name(
+          dtype=_DATA_TYPES, param_value=_TEST_PARAM_VALUES))
+  def testSparse(self, dtype, param_value):
+    (learning_rate, decay, momentum, epsilon, centered, _) = tuple(
+        param_value)
+    with self.test_session(use_gpu=True):
+      # Initialize variables for numpy implementation.
+      var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+      grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)
+      var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+      grads1_np = np.array([0.01], dtype=dtype.as_numpy_dtype)
+
+      var0 = variables.Variable(var0_np)
+      var1 = variables.Variable(var1_np)
+      grads0_np_indices = np.array([0], dtype=np.int32)
+      grads0 = ops.IndexedSlices(
+          constant_op.constant(grads0_np),
+          constant_op.constant(grads0_np_indices), constant_op.constant([1]))
+      grads1_np_indices = np.array([1], dtype=np.int32)
+      grads1 = ops.IndexedSlices(
+          constant_op.constant(grads1_np),
+          constant_op.constant(grads1_np_indices), constant_op.constant([1]))
+      opt = rmsprop.RMSPropOptimizer(
+          learning_rate=learning_rate,
+          decay=decay,
+          momentum=momentum,
+          epsilon=epsilon,
+          centered=centered)
+      update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      variables.global_variables_initializer().run()
+
+      mg0 = opt.get_slot(var0, "mg")
+      self.assertEqual(mg0 is not None, centered)
+      mg1 = opt.get_slot(var1, "mg")
+      self.assertEqual(mg1 is not None, centered)
+      rms0 = opt.get_slot(var0, "rms")
+      self.assertIsNotNone(rms0)
+      rms1 = opt.get_slot(var1, "rms")
+      self.assertIsNotNone(rms1)
+      mom0 = opt.get_slot(var0, "momentum")
+      self.assertIsNotNone(mom0)
+      mom1 = opt.get_slot(var1, "momentum")
+      self.assertIsNotNone(mom1)
+
+      mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+      mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+      rms0_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+      rms1_np = np.array([1.0, 1.0], dtype=dtype.as_numpy_dtype)
+      mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+      mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], var0.eval())
+      self.assertAllClose([3.0, 4.0], var1.eval())
+
+      # Run 4 steps of RMSProp
+      for _ in range(4):
         update.run()
-        # Check the rms accumulators.
-        self.assertAllCloseAccordingToType(
-            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
-        self.assertAllCloseAccordingToType(
-            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
-        self.assertAllCloseAccordingToType(
-            np.array([
-                0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
-                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)),
-                0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
-                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))
-            ]), mom0.eval())
-        self.assertAllCloseAccordingToType(
-            np.array([
-                0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
-                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)),
-                0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
-                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))
-            ]), mom1.eval())
-
-        # Check the parameters.
-        self.assertAllCloseAccordingToType(
-            np.array([
-                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
-                (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
-                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))),
-                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
-                (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
-                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)))
-            ]), var0.eval())
-
-        self.assertAllCloseAccordingToType(
-            np.array([
-                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
-                (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
-                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))),
-                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
-                (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
-                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)))
-            ]), var1.eval())
+
+        var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
+            var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
+            learning_rate, decay, momentum, epsilon, centered)
+        var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy(
+            var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np, mom1_np,
+            learning_rate, decay, momentum, epsilon, centered)
+
+        # Validate updated params
+        if centered:
+          self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
+          self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
+        self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
+        self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
+        self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
+        self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
+        self.assertAllCloseAccordingToType(var0_np, var0.eval())
+        self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  @parameterized.parameters(_DATA_TYPES)
+  def testWithoutMomentum(self, dtype):
+    with self.test_session(use_gpu=True):
+      var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+      var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+      opt = rmsprop.RMSPropOptimizer(
+          learning_rate=2.0, decay=0.9, momentum=0.0, epsilon=1.0)
+      update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      variables.global_variables_initializer().run()
+
+      rms0 = opt.get_slot(var0, "rms")
+      self.assertIsNotNone(rms0)
+      rms1 = opt.get_slot(var1, "rms")
+      self.assertIsNotNone(rms1)
+      mom0 = opt.get_slot(var0, "momentum")
+      self.assertIsNotNone(mom0)
+      mom1 = opt.get_slot(var1, "momentum")
+      self.assertIsNotNone(mom1)
+
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], var0.eval())
+      self.assertAllClose([3.0, 4.0], var1.eval())
+      # Step 1: the rms accumulators where 1. So we should see a normal
+      # update: v -= grad * learning_rate
+      update.run()
+      # Check the root mean square accumulators.
+      self.assertAllCloseAccordingToType(
+          np.array([0.901, 0.901]), rms0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([0.90001, 0.90001]), rms1.eval())
+      # Check the parameters.
+      self.assertAllCloseAccordingToType(
+          np.array([
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0))
+          ]), var0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0))
+          ]), var1.eval())
+      # Step 2: the root mean square accumulators contain the previous update.
+      update.run()
+      # Check the rms accumulators.
+      self.assertAllCloseAccordingToType(
+          np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+      # Check the parameters.
+      self.assertAllCloseAccordingToType(
+          np.array([
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0)),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0))
+          ]), var0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0)),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0))
+          ]), var1.eval())
+
+  @parameterized.parameters(_DATA_TYPES)
+  def testWithMomentum(self, dtype):
+    with self.test_session(use_gpu=True):
+      var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+      var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+
+      opt = rmsprop.RMSPropOptimizer(
+          learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1e-5)
+      update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+      variables.global_variables_initializer().run()
+
+      rms0 = opt.get_slot(var0, "rms")
+      self.assertIsNotNone(rms0)
+      rms1 = opt.get_slot(var1, "rms")
+      self.assertIsNotNone(rms1)
+      mom0 = opt.get_slot(var0, "momentum")
+      self.assertIsNotNone(mom0)
+      mom1 = opt.get_slot(var1, "momentum")
+      self.assertIsNotNone(mom1)
+
+      # Fetch params to validate initial values
+      self.assertAllClose([1.0, 2.0], var0.eval())
+      self.assertAllClose([3.0, 4.0], var1.eval())
+      # Step 1: rms = 1, mom = 0. So we should see a normal
+      # update: v -= grad * learning_rate
+      update.run()
+      # Check the root mean square accumulators.
+      self.assertAllCloseAccordingToType(
+          np.array([0.901, 0.901]), rms0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([0.90001, 0.90001]), rms1.eval())
+      # Check the momentum accumulators
+      self.assertAllCloseAccordingToType(
+          np.array([(0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
+                    (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]), mom0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([(0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
+                    (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]), mom1.eval())
+
+      # Check that the parameters.
+      self.assertAllCloseAccordingToType(
+          np.array([
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))
+          ]), var0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))
+          ]), var1.eval())
+
+      # Step 2: the root mean square accumulators contain the previous update.
+      update.run()
+      # Check the rms accumulators.
+      self.assertAllCloseAccordingToType(
+          np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([
+              0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)),
+              0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+              (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))
+          ]), mom0.eval())
+      self.assertAllCloseAccordingToType(
+          np.array([
+              0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)),
+              0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+              (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))
+          ]), mom1.eval())
+
+      # Check the parameters.
+      self.assertAllCloseAccordingToType(
+          np.array([
+              1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
+              (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+               (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))),
+              2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
+              (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
+               (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)))
+          ]), var0.eval())
+
+      self.assertAllCloseAccordingToType(
+          np.array([
+              3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
+              (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+               (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))),
+              4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
+              (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
+               (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)))
+          ]), var1.eval())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/proto/BUILD b/tensorflow/contrib/proto/BUILD
index d45622174f21d9d104321cd56e47a3d120bcc03d..b27142cf4a6413eccb8489ea3eb775060ffd787b 100644
--- a/tensorflow/contrib/proto/BUILD
+++ b/tensorflow/contrib/proto/BUILD
@@ -16,15 +16,3 @@ py_library(
         "//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
     ],
 )
-
-py_library(
-    name = "proto_pip",
-    data = if_static(
-        [],
-        otherwise = ["//tensorflow/contrib/proto/python/kernel_tests:libtestexample.so"],
-    ),
-    deps = [
-        ":proto",
-        "//tensorflow/contrib/proto/python/kernel_tests:py_test_deps",
-    ],
-)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/BUILD b/tensorflow/contrib/proto/python/kernel_tests/BUILD
index 3c6fde23d2a321d5b9ba24b610f08a2646f5b2d1..125c1cee292092e55bc17294a29f175c8cc3999c 100644
--- a/tensorflow/contrib/proto/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/proto/python/kernel_tests/BUILD
@@ -100,3 +100,30 @@ tf_cc_shared_object(
         ":test_example_proto_cc",
     ],
 )
+
+py_library(
+    name = "descriptor_source_test_base",
+    testonly = 1,
+    srcs = ["descriptor_source_test_base.py"],
+    deps = [
+        ":proto_op_test_base",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+        "@protobuf_archive//:protobuf_python",
+    ],
+)
+
+tf_py_test(
+    name = "descriptor_source_test",
+    size = "small",
+    srcs = ["descriptor_source_test.py"],
+    additional_deps = [
+        ":descriptor_source_test_base",
+        "//tensorflow/contrib/proto/python/ops:decode_proto_op_py",
+        "//tensorflow/contrib/proto/python/ops:encode_proto_op_py",
+        "//tensorflow/python:client_testlib",
+    ],
+    tags = [
+        "no_pip",
+    ],
+)
diff --git a/tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test.py b/tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..32ca318f733ce11221539838dfdbcf710dca51a1
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test.py
@@ -0,0 +1,36 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Tests for proto ops reading descriptors from other sources."""
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.proto.python.kernel_tests import descriptor_source_test_base as test_base
+from tensorflow.contrib.proto.python.ops import decode_proto_op
+from tensorflow.contrib.proto.python.ops import encode_proto_op
+from tensorflow.python.platform import test
+
+
+class DescriptorSourceTest(test_base.DescriptorSourceTestBase):
+
+  def __init__(self, methodName='runTest'):  # pylint: disable=invalid-name
+    super(DescriptorSourceTest, self).__init__(decode_proto_op, encode_proto_op,
+                                               methodName)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test_base.py b/tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a1c04af324620fc893583ebb17cd99ea3ba166d
--- /dev/null
+++ b/tensorflow/contrib/proto/python/kernel_tests/descriptor_source_test_base.py
@@ -0,0 +1,176 @@
+# =============================================================================
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Tests for proto ops reading descriptors from other sources."""
+# Python3 preparedness imports.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+import numpy as np
+
+from google.protobuf.descriptor_pb2 import FieldDescriptorProto
+from google.protobuf.descriptor_pb2 import FileDescriptorSet
+from tensorflow.contrib.proto.python.kernel_tests import proto_op_test_base as test_base
+from tensorflow.python.framework import dtypes
+from tensorflow.python.platform import test
+
+
+class DescriptorSourceTestBase(test.TestCase):
+  """Base class for testing descriptor sources."""
+
+  def __init__(self, decode_module, encode_module, methodName='runTest'):  # pylint: disable=invalid-name
+    """DescriptorSourceTestBase initializer.
+
+    Args:
+      decode_module: a module containing the `decode_proto_op` method
+      encode_module: a module containing the `encode_proto_op` method
+      methodName: the name of the test method (same as for test.TestCase)
+    """
+
+    super(DescriptorSourceTestBase, self).__init__(methodName)
+    self._decode_module = decode_module
+    self._encode_module = encode_module
+
+  # NOTE: We generate the descriptor programmatically instead of via a compiler
+  # because of differences between different versions of the compiler.
+  #
+  # The generated descriptor should capture the subset of `test_example.proto`
+  # used in `test_base.simple_test_case()`.
+  def _createDescriptorFile(self):
+    set_proto = FileDescriptorSet()
+
+    file_proto = set_proto.file.add(
+        name='types.proto',
+        package='tensorflow',
+        syntax='proto3')
+    enum_proto = file_proto.enum_type.add(name='DataType')
+    enum_proto.value.add(name='DT_DOUBLE', number=0)
+    enum_proto.value.add(name='DT_BOOL', number=1)
+
+    file_proto = set_proto.file.add(
+        name='test_example.proto',
+        package='tensorflow.contrib.proto',
+        dependency=['types.proto'])
+    message_proto = file_proto.message_type.add(name='TestCase')
+    message_proto.field.add(
+        name='values',
+        number=1,
+        type=FieldDescriptorProto.TYPE_MESSAGE,
+        type_name='.tensorflow.contrib.proto.TestValue',
+        label=FieldDescriptorProto.LABEL_REPEATED)
+    message_proto.field.add(
+        name='shapes',
+        number=2,
+        type=FieldDescriptorProto.TYPE_INT32,
+        label=FieldDescriptorProto.LABEL_REPEATED)
+    message_proto.field.add(
+        name='sizes',
+        number=3,
+        type=FieldDescriptorProto.TYPE_INT32,
+        label=FieldDescriptorProto.LABEL_REPEATED)
+    message_proto.field.add(
+        name='fields',
+        number=4,
+        type=FieldDescriptorProto.TYPE_MESSAGE,
+        type_name='.tensorflow.contrib.proto.FieldSpec',
+        label=FieldDescriptorProto.LABEL_REPEATED)
+
+    message_proto = file_proto.message_type.add(
+        name='TestValue')
+    message_proto.field.add(
+        name='double_value',
+        number=1,
+        type=FieldDescriptorProto.TYPE_DOUBLE,
+        label=FieldDescriptorProto.LABEL_REPEATED)
+    message_proto.field.add(
+        name='bool_value',
+        number=2,
+        type=FieldDescriptorProto.TYPE_BOOL,
+        label=FieldDescriptorProto.LABEL_REPEATED)
+
+    message_proto = file_proto.message_type.add(
+        name='FieldSpec')
+    message_proto.field.add(
+        name='name',
+        number=1,
+        type=FieldDescriptorProto.TYPE_STRING,
+        label=FieldDescriptorProto.LABEL_OPTIONAL)
+    message_proto.field.add(
+        name='dtype',
+        number=2,
+        type=FieldDescriptorProto.TYPE_ENUM,
+        type_name='.tensorflow.DataType',
+        label=FieldDescriptorProto.LABEL_OPTIONAL)
+    message_proto.field.add(
+        name='value',
+        number=3,
+        type=FieldDescriptorProto.TYPE_MESSAGE,
+        type_name='.tensorflow.contrib.proto.TestValue',
+        label=FieldDescriptorProto.LABEL_OPTIONAL)
+
+    fn = os.path.join(self.get_temp_dir(), 'descriptor.pb')
+    with open(fn, 'wb') as f:
+      f.write(set_proto.SerializeToString())
+    return fn
+
+  def _testRoundtrip(self, descriptor_source):
+    # Numpy silently truncates the strings if you don't specify dtype=object.
+    in_bufs = np.array(
+        [test_base.ProtoOpTestBase.simple_test_case().SerializeToString()],
+        dtype=object)
+    message_type = 'tensorflow.contrib.proto.TestCase'
+    field_names = ['values', 'shapes', 'sizes', 'fields']
+    tensor_types = [dtypes.string, dtypes.int32, dtypes.int32, dtypes.string]
+
+    with self.test_session() as sess:
+      sizes, field_tensors = self._decode_module.decode_proto(
+          in_bufs,
+          message_type=message_type,
+          field_names=field_names,
+          output_types=tensor_types,
+          descriptor_source=descriptor_source)
+
+      out_tensors = self._encode_module.encode_proto(
+          sizes,
+          field_tensors,
+          message_type=message_type,
+          field_names=field_names,
+          descriptor_source=descriptor_source)
+
+      out_bufs, = sess.run([out_tensors])
+
+      # Check that the re-encoded tensor has the same shape.
+      self.assertEqual(in_bufs.shape, out_bufs.shape)
+
+      # Compare the input and output.
+      for in_buf, out_buf in zip(in_bufs.flat, out_bufs.flat):
+        # Check that the input and output serialized messages are identical.
+        # If we fail here, there is a difference in the serialized
+        # representation but the new serialization still parses. This could
+        # be harmless (a change in map ordering?) or it could be bad (e.g.
+        # loss of packing in the encoding).
+        self.assertEqual(in_buf, out_buf)
+
+  def testWithFileDescriptorSet(self):
+    # First try parsing with a local proto db, which should fail.
+    with self.assertRaisesOpError('No descriptor found for message type'):
+      self._testRoundtrip('local://')
+
+    # Now try parsing with a FileDescriptorSet which contains the test proto.
+    descriptor_file = self._createDescriptorFile()
+    self._testRoundtrip(descriptor_file)
diff --git a/tensorflow/contrib/rnn/python/ops/rnn.py b/tensorflow/contrib/rnn/python/ops/rnn.py
index 2f0caadda336b878e58e973e1c995cbec65d5732..0266b72dcb15e4aba01a9a31b4be75c5b84d44da 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn.py
@@ -175,7 +175,7 @@ def stack_bidirectional_dynamic_rnn(cells_fw,
   Returns:
     A tuple (outputs, output_state_fw, output_state_bw) where:
       outputs: Output `Tensor` shaped:
-        `batch_size, max_time, layers_output]`. Where layers_output
+        `[batch_size, max_time, layers_output]`. Where layers_output
         are depth-concatenated forward and backward outputs.
       output_states_fw is the final states, one tensor per layer,
         of the forward rnn.
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index 26fd4e2023806765ea4088f4c13a780ca7338bff..fbb50befdfb2ccbd97465c11f8219e604a0ebc18 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -93,3 +93,32 @@ py_test(
         "//tensorflow/python/saved_model:utils",
     ],
 )
+
+py_library(
+    name = "keras_saved_model",
+    srcs = ["python/saved_model/keras_saved_model.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/python:lib",
+        "//tensorflow/python:util",
+        "//tensorflow/python/keras:engine",
+        "//tensorflow/python/saved_model:constants",
+    ],
+)
+
+py_test(
+    name = "keras_saved_model_test",
+    size = "small",
+    srcs = ["python/saved_model/keras_saved_model_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_windows"],
+    deps = [
+        ":saved_model_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:training",
+        "//tensorflow/python/keras",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/contrib/saved_model/__init__.py b/tensorflow/contrib/saved_model/__init__.py
index b4f27a055dad7a5b95112d561cc878609a558f8d..95e1a8967b2223fd3feb112af3cbe0c5991d2d03 100644
--- a/tensorflow/contrib/saved_model/__init__.py
+++ b/tensorflow/contrib/saved_model/__init__.py
@@ -24,11 +24,12 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import,wildcard-import,line-too-long
+from tensorflow.contrib.saved_model.python.saved_model.keras_saved_model import *
 from tensorflow.contrib.saved_model.python.saved_model.signature_def_utils import *
 # pylint: enable=unused-import,widcard-import,line-too-long
 
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ["get_signature_def_by_key"]
+_allowed_symbols = ["get_signature_def_by_key", "load_model", "save_model"]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/saved_model/python/saved_model/__init__.py b/tensorflow/contrib/saved_model/python/saved_model/__init__.py
index 7b91622b6127413ce122c4166a18255b65365d32..e3b76bb6f34846f02ccdf623d48ddd9c5909fdce 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/__init__.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/__init__.py
@@ -24,5 +24,6 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import
+from tensorflow.contrib.saved_model.python.saved_model import keras_saved_model
 from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils
 # pylint: enable=wildcard-import
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2a969f053d3f1ded8aecd6411a62a198df48bb0
--- /dev/null
+++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
@@ -0,0 +1,108 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Utility functions to save/load keras Model to/from SavedModel."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.keras.models import model_from_json
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.saved_model import constants
+from tensorflow.python.util import compat
+
+
+def save_model(model, saved_model_path):
+  """Save a `tf.keras.Model` into Tensorflow SavedModel format.
+
+  `save_model` generates such files/folders under the `saved_model_path` folder:
+  1) an asset folder containing the json string of the model's
+  configuration(topology).
+  2) a checkpoint containing the model weights.
+
+  Note that subclassed models can not be saved via this function, unless you
+  provide an implementation for get_config() and from_config().
+  Also note that `tf.keras.optimizers.Optimizer` instances can not currently be
+  saved to checkpoints. Use optimizers from `tf.train`.
+
+  Args:
+    model: A `tf.keras.Model` to be saved.
+    saved_model_path: a string specifying the path to the SavedModel directory.
+
+  Raises:
+    NotImplementedError: If the passed in model is a subclassed model.
+  """
+  if not model._is_graph_network:
+    raise NotImplementedError
+
+  # save model configuration as a json string under assets folder.
+  model_json = model.to_json()
+  assets_destination_dir = os.path.join(
+      compat.as_bytes(saved_model_path),
+      compat.as_bytes(constants.ASSETS_DIRECTORY))
+
+  if not file_io.file_exists(assets_destination_dir):
+    file_io.recursive_create_dir(assets_destination_dir)
+
+  model_json_filepath = os.path.join(
+      compat.as_bytes(assets_destination_dir),
+      compat.as_bytes(constants.SAVED_MODEL_FILENAME_JSON))
+  file_io.write_string_to_file(model_json_filepath, model_json)
+
+  # save model weights in checkpoint format.
+  checkpoint_destination_dir = os.path.join(
+      compat.as_bytes(saved_model_path),
+      compat.as_bytes(constants.VARIABLES_DIRECTORY))
+
+  if not file_io.file_exists(checkpoint_destination_dir):
+    file_io.recursive_create_dir(checkpoint_destination_dir)
+
+  checkpoint_prefix = os.path.join(
+      compat.as_text(checkpoint_destination_dir),
+      compat.as_text(constants.VARIABLES_FILENAME))
+  model.save_weights(checkpoint_prefix, save_format='tf', overwrite=True)
+
+
+def load_model(saved_model_path):
+  """Load a keras.Model from SavedModel.
+
+  load_model reinstantiates model state by:
+  1) loading model topology from json (this will eventually come
+     from metagraph).
+  2) loading model weights from checkpoint.
+
+  Args:
+    saved_model_path: a string specifying the path to an existing SavedModel.
+
+  Returns:
+    a keras.Model instance.
+  """
+  # restore model topology from json string
+  model_json_filepath = os.path.join(
+      compat.as_bytes(saved_model_path),
+      compat.as_bytes(constants.ASSETS_DIRECTORY),
+      compat.as_bytes(constants.SAVED_MODEL_FILENAME_JSON))
+  model_json = file_io.read_file_to_string(model_json_filepath)
+  model = model_from_json(model_json)
+
+  # restore model weights
+  checkpoint_prefix = os.path.join(
+      compat.as_text(saved_model_path),
+      compat.as_text(constants.VARIABLES_DIRECTORY),
+      compat.as_text(constants.VARIABLES_FILENAME))
+  model.load_weights(checkpoint_prefix)
+  return model
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..107ae1b07b777570e4124337595ceecd6e33cd0b
--- /dev/null
+++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
@@ -0,0 +1,201 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Tests for saving/loading function for keras Model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import numpy as np
+
+from tensorflow.contrib.saved_model.python.saved_model import keras_saved_model
+from tensorflow.python import keras
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import training
+from tensorflow.python.platform import test
+from tensorflow.python.training import training as training_module
+
+
+class TestModelSavingandLoading(test.TestCase):
+
+  def test_saving_sequential_model(self):
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_shape=(3,)))
+      model.add(keras.layers.RepeatVector(3))
+      model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
+      model.compile(
+          loss=keras.losses.MSE,
+          optimizer=keras.optimizers.RMSprop(lr=0.0001),
+          metrics=[keras.metrics.categorical_accuracy],
+          sample_weight_mode='temporal')
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3, 3))
+      model.train_on_batch(x, y)
+
+      ref_y = model.predict(x)
+      temp_dir = self.get_temp_dir()
+      self.addCleanup(shutil.rmtree, temp_dir)
+
+      temp_saved_model = os.path.join(temp_dir, 'saved_model')
+      keras_saved_model.save_model(model, temp_saved_model)
+
+      loaded_model = keras_saved_model.load_model(temp_saved_model)
+      y = loaded_model.predict(x)
+      self.assertAllClose(ref_y, y, atol=1e-05)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_saving_sequential_model_without_compile(self):
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_shape=(3,)))
+      model.add(keras.layers.RepeatVector(3))
+      model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
+
+      x = np.random.random((1, 3))
+      ref_y = model.predict(x)
+
+      temp_dir = self.get_temp_dir()
+      self.addCleanup(shutil.rmtree, temp_dir)
+
+      temp_saved_model = os.path.join(temp_dir, 'saved_model')
+      keras_saved_model.save_model(model, temp_saved_model)
+      loaded_model = keras_saved_model.load_model(temp_saved_model)
+
+      y = loaded_model.predict(x)
+      self.assertAllClose(ref_y, y, atol=1e-05)
+
+  def test_saving_functional_model(self):
+    with self.test_session():
+      inputs = keras.layers.Input(shape=(3,))
+      x = keras.layers.Dense(2)(inputs)
+      output = keras.layers.Dense(3)(x)
+
+      model = keras.models.Model(inputs, output)
+      model.compile(
+          loss=keras.losses.MSE,
+          optimizer=keras.optimizers.RMSprop(lr=0.0001),
+          metrics=[keras.metrics.categorical_accuracy])
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3))
+      model.train_on_batch(x, y)
+
+      ref_y = model.predict(x)
+      temp_dir = self.get_temp_dir()
+      self.addCleanup(shutil.rmtree, temp_dir)
+
+      temp_saved_model = os.path.join(temp_dir, 'saved_model')
+      keras_saved_model.save_model(model, temp_saved_model)
+      loaded_model = keras_saved_model.load_model(temp_saved_model)
+
+      y = loaded_model.predict(x)
+      self.assertAllClose(ref_y, y, atol=1e-05)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_saving_functional_model_without_compile(self):
+    with self.test_session():
+      inputs = keras.layers.Input(shape=(3,))
+      x = keras.layers.Dense(2)(inputs)
+      output = keras.layers.Dense(3)(x)
+
+      model = keras.models.Model(inputs, output)
+
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3))
+
+      ref_y = model.predict(x)
+      temp_dir = self.get_temp_dir()
+      self.addCleanup(shutil.rmtree, temp_dir)
+
+      temp_saved_model = os.path.join(temp_dir, 'saved_model')
+      keras_saved_model.save_model(model, temp_saved_model)
+      loaded_model = keras_saved_model.load_model(temp_saved_model)
+
+      y = loaded_model.predict(x)
+      self.assertAllClose(ref_y, y, atol=1e-05)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_saving_with_tf_optimizer(self):
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_shape=(3,)))
+      model.add(keras.layers.Dense(3))
+      model.compile(
+          loss='mse',
+          optimizer=training_module.RMSPropOptimizer(0.1),
+          metrics=['acc'])
+
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3))
+      model.train_on_batch(x, y)
+
+      ref_y = model.predict(x)
+      temp_dir = self.get_temp_dir()
+      self.addCleanup(shutil.rmtree, temp_dir)
+
+      temp_saved_model = os.path.join(temp_dir, 'saved_model')
+      keras_saved_model.save_model(model, temp_saved_model)
+      loaded_model = keras_saved_model.load_model(temp_saved_model)
+      loaded_model.compile(
+          loss='mse',
+          optimizer=training_module.RMSPropOptimizer(0.1),
+          metrics=['acc'])
+      y = loaded_model.predict(x)
+      self.assertAllClose(ref_y, y, atol=1e-05)
+
+      # test that new updates are the same with both models
+      x = np.random.random((1, 3))
+      y = np.random.random((1, 3))
+
+      ref_loss = model.train_on_batch(x, y)
+      loss = loaded_model.train_on_batch(x, y)
+      self.assertAllClose(ref_loss, loss, atol=1e-05)
+
+      ref_y = model.predict(x)
+      y = loaded_model.predict(x)
+      self.assertAllClose(ref_y, y, atol=1e-05)
+
+      # test saving/loading again
+      keras_saved_model.save_model(loaded_model, temp_saved_model)
+      loaded_model = keras_saved_model.load_model(temp_saved_model)
+      y = loaded_model.predict(x)
+      self.assertAllClose(ref_y, y, atol=1e-05)
+
+  def test_saving_subclassed_model_raise_error(self):
+    # For now, saving subclassed model should raise an error. It should be
+    # avoided later with loading from SavedModel.pb.
+
+    class SubclassedModel(training.Model):
+
+      def __init__(self):
+        super(SubclassedModel, self).__init__()
+        self.layer1 = keras.layers.Dense(3)
+        self.layer2 = keras.layers.Dense(1)
+
+      def call(self, inp):
+        return self.layer2(self.layer1(inp))
+
+    model = SubclassedModel()
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir)
+    temp_saved_model = os.path.join(temp_dir, 'saved_model')
+    with self.assertRaises(NotImplementedError):
+      keras_saved_model.save_model(model, temp_saved_model)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py b/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
index 345eb6cfaa67fd4cda6e7e3f01a1243bbf3c9fa1..f4348e80eac54933d67cdf7bd281d6a9c6c10381 100644
--- a/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
+++ b/tensorflow/contrib/signal/python/kernel_tests/mel_ops_test.py
@@ -53,7 +53,8 @@ def spectrogram_to_mel_matrix(num_mel_bins=20,
                               num_spectrogram_bins=129,
                               audio_sample_rate=8000,
                               lower_edge_hertz=125.0,
-                              upper_edge_hertz=3800.0):
+                              upper_edge_hertz=3800.0,
+                              unused_dtype=None):
   """Return a matrix that can post-multiply spectrogram rows to make mel.
 
   Copied from
@@ -132,9 +133,9 @@ class LinearToMelTest(test.TestCase):
     # lower_edge_hertz, upper_edge_hertz) to test.
     configs = [
         # Defaults.
-        (20, 129, 8000.0, 125.0, 3800.0),
+        (20, 129, 8000.0, 125.0, 3800.0, dtypes.float64),
         # Settings used by Tacotron (https://arxiv.org/abs/1703.10135).
-        (80, 1025, 24000.0, 80.0, 12000.0)
+        (80, 1025, 24000.0, 80.0, 12000.0, dtypes.float64)
     ]
     with self.test_session(use_gpu=True):
       for config in configs:
@@ -143,7 +144,8 @@ class LinearToMelTest(test.TestCase):
         self.assertAllClose(mel_matrix_np, mel_matrix.eval(), atol=3e-6)
 
   def test_dtypes(self):
-    for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
+    # LinSpace is not supported for tf.float16.
+    for dtype in (dtypes.bfloat16, dtypes.float32, dtypes.float64):
       self.assertEqual(dtype,
                        mel_ops.linear_to_mel_weight_matrix(dtype=dtype).dtype)
 
@@ -167,7 +169,8 @@ class LinearToMelTest(test.TestCase):
 
   def test_constant_folding(self):
     """Mel functions should be constant foldable."""
-    for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
+    # TODO(rjryan): tf.bloat16 cannot be constant folded by Grappler.
+    for dtype in (dtypes.float32, dtypes.float64):
       g = ops.Graph()
       with g.as_default():
         mel_matrix = mel_ops.linear_to_mel_weight_matrix(dtype=dtype)
diff --git a/tensorflow/contrib/signal/python/ops/mel_ops.py b/tensorflow/contrib/signal/python/ops/mel_ops.py
index 1e84006116daa3f28c760037cb9eeafd53eaafb8..062d84aea183ab61501a8b07521adb1a1a17c63c 100644
--- a/tensorflow/contrib/signal/python/ops/mel_ops.py
+++ b/tensorflow/contrib/signal/python/ops/mel_ops.py
@@ -151,22 +151,21 @@ def linear_to_mel_weight_matrix(num_mel_bins=20,
     _validate_arguments(num_mel_bins, sample_rate,
                         lower_edge_hertz, upper_edge_hertz, dtype)
 
-    # To preserve accuracy, we compute the matrix at float64 precision and then
-    # cast to `dtype` at the end. This function can be constant folded by graph
-    # optimization since there are no Tensor inputs.
+    # This function can be constant folded by graph optimization since there are
+    # no Tensor inputs.
     sample_rate = ops.convert_to_tensor(
-        sample_rate, dtypes.float64, name='sample_rate')
+        sample_rate, dtype, name='sample_rate')
     lower_edge_hertz = ops.convert_to_tensor(
-        lower_edge_hertz, dtypes.float64, name='lower_edge_hertz')
+        lower_edge_hertz, dtype, name='lower_edge_hertz')
     upper_edge_hertz = ops.convert_to_tensor(
-        upper_edge_hertz, dtypes.float64, name='upper_edge_hertz')
-    zero_float64 = ops.convert_to_tensor(0.0, dtypes.float64)
+        upper_edge_hertz, dtype, name='upper_edge_hertz')
+    zero = ops.convert_to_tensor(0.0, dtype)
 
     # HTK excludes the spectrogram DC bin.
     bands_to_zero = 1
     nyquist_hertz = sample_rate / 2.0
     linear_frequencies = math_ops.linspace(
-        zero_float64, nyquist_hertz, num_spectrogram_bins)[bands_to_zero:]
+        zero, nyquist_hertz, num_spectrogram_bins)[bands_to_zero:]
     spectrogram_bins_mel = array_ops.expand_dims(
         _hertz_to_mel(linear_frequencies), 1)
 
@@ -193,11 +192,8 @@ def linear_to_mel_weight_matrix(num_mel_bins=20,
 
     # Intersect the line segments with each other and zero.
     mel_weights_matrix = math_ops.maximum(
-        zero_float64, math_ops.minimum(lower_slopes, upper_slopes))
+        zero, math_ops.minimum(lower_slopes, upper_slopes))
 
     # Re-add the zeroed lower bins we sliced out above.
-    mel_weights_matrix = array_ops.pad(
-        mel_weights_matrix, [[bands_to_zero, 0], [0, 0]])
-
-    # Cast to the desired type.
-    return math_ops.cast(mel_weights_matrix, dtype, name=name)
+    return array_ops.pad(
+        mel_weights_matrix, [[bands_to_zero, 0], [0, 0]], name=name)
diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index 3e41e3d0b48ea06f9cb8c1862e27eacb5ebc4417..4d1807130c57039976dfa57c27bb0d4807e75212 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -20,6 +20,8 @@ import os
 import tempfile
 import time
 
+import sqlite3
+
 import numpy as np
 import six
 
@@ -275,6 +277,22 @@ class EagerFileTest(test_util.TensorFlowTestCase):
 
 class EagerDbTest(summary_test_util.SummaryDbTest):
 
+  def testDbURIOpen(self):
+    tmpdb_path = os.path.join(self.get_temp_dir(), 'tmpDbURITest.sqlite')
+    tmpdb_uri = six.moves.urllib_parse.urljoin("file:", tmpdb_path)
+    tmpdb_writer = summary_ops.create_db_writer(
+        tmpdb_uri,
+        "experimentA",
+        "run1",
+        "user1")
+    with summary_ops.always_record_summaries():
+      with tmpdb_writer.as_default():
+        summary_ops.scalar('t1', 2.0)
+    tmpdb = sqlite3.connect(tmpdb_path)
+    num = get_one(tmpdb, 'SELECT count(*) FROM Tags WHERE tag_name = "t1"')
+    self.assertEqual(num, 1)
+    tmpdb.close()
+
   def testIntegerSummaries(self):
     step = training_util.create_global_step()
     writer = self.create_db_writer()
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 70ce4a499c39ae29fac80441b754c3a674161340..46f3c36e3db51fde4c8732d4300a9d3eaddb452a 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -11,7 +11,6 @@ exports_files(["LICENSE"])
 
 load(
     "//tensorflow:tensorflow.bzl",
-    "cuda_py_test",
     "tf_cc_test",
     "tf_copts",
     "tf_cuda_library",
@@ -20,6 +19,7 @@ load(
     "tf_gen_op_libs",
     "tf_gen_op_wrapper_py",
 )
+load("//tensorflow:tensorflow.bzl", "cuda_py_tests")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc")
@@ -32,9 +32,14 @@ tf_cuda_cc_test(
     name = "tensorrt_test_cc",
     size = "small",
     srcs = ["tensorrt_test.cc"],
-    tags = ["no_windows"],
+    tags = [
+        "no_windows",
+        "nomac",
+    ],
     deps = [
+        "//tensorflow/core:gpu_init",
         "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ] + if_tensorrt([
@@ -80,6 +85,7 @@ cc_library(
     copts = tf_copts(),
     visibility = ["//visibility:public"],
     deps = [
+        ":trt_allocator",
         ":trt_logging",
         ":trt_plugins",
         ":trt_resources",
@@ -116,7 +122,6 @@ tf_cuda_library(
 
 tf_gen_op_wrapper_py(
     name = "trt_engine_op",
-    gen_locally = True,
     deps = [
         ":trt_engine_op_op_lib",
         ":trt_logging",
@@ -153,6 +158,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":tf_trt_integration_test_base",
         ":trt_convert_py",
         ":trt_ops_py",
         "//tensorflow/python:errors",
@@ -195,17 +201,16 @@ tf_py_wrap_cc(
 tf_cuda_library(
     name = "trt_resources",
     srcs = [
-        "resources/trt_allocator.cc",
         "resources/trt_int8_calibrator.cc",
         "resources/trt_resource_manager.cc",
     ],
     hdrs = [
-        "resources/trt_allocator.h",
         "resources/trt_int8_calibrator.h",
         "resources/trt_resource_manager.h",
         "resources/trt_resources.h",
     ],
     deps = [
+        ":trt_allocator",
         ":trt_logging",
         ":utils",
         "//tensorflow/core:framework_headers_lib",
@@ -216,6 +221,34 @@ tf_cuda_library(
     ]),
 )
 
+tf_cuda_library(
+    name = "trt_allocator",
+    srcs = ["resources/trt_allocator.cc"],
+    hdrs = ["resources/trt_allocator.h"],
+    deps = [
+        "//tensorflow/core:framework_headers_lib",
+        "//tensorflow/core:framework_lite",
+        "//tensorflow/core:lib_proto_parsing",
+    ] + if_tensorrt([
+        "@local_config_tensorrt//:nv_infer",
+    ]),
+)
+
+tf_cc_test(
+    name = "trt_allocator_test",
+    size = "small",
+    srcs = ["resources/trt_allocator_test.cc"],
+    tags = [
+        "no_windows",
+        "nomac",
+    ],
+    deps = [
+        ":trt_allocator",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 # Library for the node-level conversion portion of TensorRT operation creation
 tf_cuda_library(
     name = "trt_conversion",
@@ -231,6 +264,7 @@ tf_cuda_library(
     ],
     deps = [
         ":segment",
+        ":trt_allocator",
         ":trt_plugins",
         ":trt_logging",
         ":trt_resources",
@@ -275,14 +309,21 @@ tf_cc_test(
     name = "segment_test",
     size = "small",
     srcs = ["segment/segment_test.cc"],
-    tags = ["no_windows"],
+    tags = [
+        "no_windows",
+        "nomac",
+    ],
     deps = [
         ":segment",
-        "//tensorflow/c:c_api",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/cc:scope",
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:lib",
+        "//tensorflow/core:ops",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
     ],
 )
 
@@ -311,7 +352,11 @@ tf_cuda_cc_test(
     name = "trt_plugin_factory_test",
     size = "small",
     srcs = ["plugin/trt_plugin_factory_test.cc"],
-    tags = ["no_windows"],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_windows",
+        "nomac",
+    ],
     deps = [
         ":trt_plugins",
         "//tensorflow/core:lib",
@@ -323,16 +368,40 @@ tf_cuda_cc_test(
     ]),
 )
 
-cuda_py_test(
+py_library(
+    name = "tf_trt_integration_test_base",
+    srcs = ["test/tf_trt_integration_test_base.py"],
+    deps = [
+        ":trt_convert_py",
+        ":trt_ops_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+    ],
+)
+
+cuda_py_tests(
     name = "tf_trt_integration_test",
-    srcs = ["test/tf_trt_integration_test.py"],
+    srcs = [
+        "test/base_test.py",
+        # "test/batch_matmul_test.py",
+        # "test/biasadd_matmul_test.py",
+        # "test/binary_tensor_weight_broadcast_test.py",  # Blocked by trt4 installation
+        # "test/concatenation_test.py",  # Blocked by trt4 installation
+        "test/const_broadcast_test.py",
+        "test/multi_connection_neighbor_engine_test.py",
+        "test/neighboring_engine_test.py",
+        # "test/unary_test.py",  # Blocked by trt4 installation
+        # "test/vgg_block_nchw_test.py",
+        # "test/vgg_block_test.py",
+        "test/memory_alignment_test.py",
+    ],
     additional_deps = [
-        ":init_py",
+        ":tf_trt_integration_test_base",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
     ],
-    main = "test/tf_trt_integration_test.py",
     tags = [
+        "no_cuda_on_cpu_tap",
         "no_windows",
         "nomac",
     ],
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 68c78e8301a4693f0937e193ee2f071480a208e3..3383f6bc9b99879a1c661a0d49e42c6f3b878f66 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -301,7 +301,8 @@ tensorflow::Status GetEngineInfo(
     const int node_id = node->id();
     for (const auto edge : node->in_edges()) {
       auto input_node = edge->src();
-      if (segment_nodes.count(input_node->name()) == 0) {
+      if (segment_nodes.count(input_node->name()) == 0 &&
+          !edge->IsControlEdge() && !input_node->IsSource()) {
         // Add constant input node into the segment. We don't care if it has
         // other output edges going into other engines or TF nodes. Since we add
         // it only to the subsegment node list, not the subsegment itself, it
@@ -312,7 +313,7 @@ tensorflow::Status GetEngineInfo(
             added_const_node_ids.insert(input_node->id());
             subgraph_node_ids.push_back(input_node->id());
           }
-        } else if (!edge->IsControlEdge() && !input_node->IsSource()) {
+        } else {
           string s(input_node->name());
           StrAppend(&s, ":", edge->src_output());
           VLOG(1) << "Input edge = " << s;
@@ -378,9 +379,9 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
                                  nvinfer1::IGpuAllocator* alloc,
                                  int max_batch_size) {
   const auto& info = infos.at(pos);
-  std::vector<tensorflow::TensorShapeProto> out_shapes;
-  std::vector<tensorflow::TensorShapeProto> input_shapes;
-  std::vector<tensorflow::PartialTensorShape> shapes;
+  std::vector<tensorflow::TensorShapeProto> output_shape_protos;
+  std::vector<tensorflow::TensorShapeProto> input_shape_protos;
+  std::vector<tensorflow::PartialTensorShape> input_shapes;
   std::vector<tensorflow::NodeDefBuilder::NodeOut> inputs;
   std::vector<tensorflow::DataType> out_types;
   VLOG(1) << "Processing " << info.engine_name;
@@ -393,11 +394,11 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
       tensorflow::TensorShapeProto out_shape;
       // shape of the output node inside segment
       conn.inside_shape.AsProto(&out_shape);
-      if (out_shapes.size() <= conn.port_number) {
-        out_shapes.resize(conn.port_number + 1);
+      if (output_shape_protos.size() <= conn.port_number) {
+        output_shape_protos.resize(conn.port_number + 1);
         out_types.resize(conn.port_number + 1);
       }
-      out_shapes.at(conn.port_number) = out_shape;
+      output_shape_protos.at(conn.port_number) = out_shape;
       out_types.at(conn.port_number) = conn.connection_type;
       continue;
     }
@@ -405,12 +406,12 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
     // Set the shapes and data types of input edge.
     tensorflow::TensorShapeProto in_shape;
     conn.outside_shape.AsProto(&in_shape);
-    if (input_shapes.size() <= conn.port_number) {
+    if (input_shape_protos.size() <= conn.port_number) {
+      input_shape_protos.resize(conn.port_number + 1);
       input_shapes.resize(conn.port_number + 1);
-      shapes.resize(conn.port_number + 1);
     }
-    input_shapes.at(conn.port_number) = in_shape;
-    shapes.at(conn.port_number) = conn.outside_shape;
+    input_shape_protos.at(conn.port_number) = in_shape;
+    input_shapes.at(conn.port_number) = conn.outside_shape;
 
     string input_node = conn.outside_node_name;
     int input_port = conn.outside_port;
@@ -438,6 +439,8 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
     VLOG(1) << "Engine Input " << input_node << ":" << input_port << " -> "
             << info.engine_name << ":" << inputs.size();
     // Skip duplicate inputs.
+    // TODO(aaroey): use std::find instead. GetEngineInfo already remove
+    // duplicate connections, so here we should never find any duplicate?
     bool new_input = true;
     for (const auto& inp : inputs) {
       if (inp.node == input_node && inp.index == input_port) {
@@ -465,8 +468,8 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
     TF_RETURN_IF_ERROR(ConvertGraphDefToEngine(
         info.segment_graph_def,
         info.precision_mode == INT8MODE ? FP32MODE : info.precision_mode,
-        max_batch_size, info.max_workspace_size_bytes, shapes, &trt_logger,
-        alloc, /*calibrator=*/nullptr, &engine,
+        max_batch_size, info.max_workspace_size_bytes, input_shapes,
+        &trt_logger, alloc, /*calibrator=*/nullptr, &engine,
         /*convert_successfully=*/nullptr));
     TrtUniquePtrType<nvinfer1::IHostMemory> engine_data(engine->serialize());
     segment_string =
@@ -514,8 +517,8 @@ tensorflow::Status CreateTRTNode(tensorflow::Graph* graph,
   }
   tensorflow::NodeDef trt_node;
   tensorflow::Status status =
-      node_builder.Attr("input_shapes", input_shapes)
-          .Attr("output_shapes", out_shapes)
+      node_builder.Attr("input_shapes", input_shape_protos)
+          .Attr("output_shapes", output_shape_protos)
           .Attr("static_engine",
                 info.engine_type == EngineInfo::EngineType::TRTStatic)
           .Attr("segment_funcdef_name",
@@ -734,6 +737,7 @@ std::pair<int, tensorflow::Allocator*> GetDeviceAndAllocator(
 }
 
 // Entry function from optimization pass.
+// TODO(aaeory): parameter should use pointer type.
 tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
   // Convert graphdef to graph.
   tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
@@ -751,7 +755,8 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
   segment_options.minimum_segment_size = params.minimum_segment_size;
   tensorflow::tensorrt::segment::SegmentNodesVector initial_segments;
   TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
-      &graph, IsTensorRTCandidate, segment_options, &initial_segments));
+      &graph, IsTensorRTCandidate, InputEdgeValidator(*params.graph_properties),
+      OutputEdgeValidator(), segment_options, &initial_segments));
   if (initial_segments.size() > 1) {
     VLOG(0) << "MULTIPLE tensorrt candidate conversion: "
             << initial_segments.size();
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 65fef275338508f476615e2c73f6c0db31800c2b..451d6fe698bbcf89570fdf54fb3d780a731e7d74 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
 
 #include <algorithm>
+#include <cstring>
 #include <list>
 #include <map>
 #include <memory>
@@ -77,7 +78,6 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 using ::tensorflow::str_util::Split;
-
 using ::tensorflow::strings::StrAppend;
 using ::tensorflow::strings::StrCat;
 
@@ -107,6 +107,59 @@ inline tensorflow::Status ConvertDType(tensorflow::DataType tf_dtype,
   return tensorflow::Status::OK();
 }
 
+void GetInputProperties(const grappler::GraphProperties& graph_properties,
+                        const Node* outside_node, const int out_port,
+                        PartialTensorShape* shape,
+                        tensorflow::DataType* dtype) {
+  if (graph_properties.HasOutputProperties(outside_node->name())) {
+    auto output_params =
+        graph_properties.GetOutputProperties(outside_node->name());
+    auto out_shape = output_params.at(out_port);
+    *dtype = out_shape.dtype();
+    *shape = out_shape.shape();
+  } else {
+    VLOG(0) << "Unknown output shape" << outside_node->name();
+    *dtype = outside_node->output_type(out_port);
+  }
+}
+
+void GetOutputProperties(const grappler::GraphProperties& graph_properties,
+                         const Node* outside_node, const int in_port,
+                         PartialTensorShape* shape,
+                         tensorflow::DataType* dtype) {
+  if (graph_properties.HasInputProperties(outside_node->name())) {
+    auto input_params =
+        graph_properties.GetInputProperties(outside_node->name());
+    auto in_shape = input_params.at(in_port);
+    *dtype = in_shape.dtype();
+    *shape = in_shape.shape();
+  } else {
+    *dtype = outside_node->input_type(in_port);
+  }
+}
+
+tensorflow::Status ValidateInputProperties(const PartialTensorShape& shape,
+                                           const tensorflow::DataType dtype,
+                                           nvinfer1::DataType* trt_dtype) {
+  // TODO(aaroey): some of these checks also apply to IsTensorRTCandidate(), so
+  // put them there instead.
+  TF_RETURN_IF_ERROR(ConvertDType(dtype, trt_dtype));
+  if (shape.dims() < 0) {
+    return tensorflow::errors::InvalidArgument("Input tensor rank is unknown.");
+  }
+  if (shape.dims() > 9) {
+    return tensorflow::errors::OutOfRange(
+        "Input tensor rank is greater than 8.");
+  }
+  for (int d = 1; d < shape.dims(); ++d) {
+    if (shape.dim_size(d) < 0) {
+      return tensorflow::errors::InvalidArgument(
+          "Input tensor has a unknown non-batch dimemension at dim ", d);
+    }
+  }
+  return Status::OK();
+}
+
 // Return whether or not the broadcast is feasible;
 bool TensorRTGetBroadcastShape(const nvinfer1::Dims& operand_l,
                                const bool operand_l_is_tensor,
@@ -2588,6 +2641,8 @@ void Converter::register_op_converters() {
   op_registry_["BatchMatMul"] = ConvertBatchMatMul;
   op_registry_["TopKV2"] = ConvertTopK;
 #endif
+
+  plugin_converter_ = ConvertPlugin;
 }
 
 }  // namespace
@@ -2640,25 +2695,22 @@ tensorflow::Status ConvertGraphDefToEngine(
         (node_def.op() == "Placeholder")) {
       nvinfer1::DimsCHW input_dim_pseudo_chw;
       for (int i = 0; i < 8; i++) input_dim_pseudo_chw.d[i] = 0;
-      nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
-      auto type_status =
-          ConvertDType(node_def.attr().at("dtype").type(), &dtype);
-      if (type_status != tensorflow::Status::OK()) {
-        LOG(WARNING) << "Type conversion failed for " << node_name;
-        return type_status;
-      }
       int32 slot_number = -1;
-      if (!tensorflow::strings::safe_strto32(node_name.c_str() + 8,
-                                             &slot_number)) {
-        LOG(ERROR) << "Failed to parse slot number from " << node_name
-                   << " +8= " << node_name.c_str() + 8;
+      if (!tensorflow::strings::safe_strto32(
+              node_name.c_str() + strlen(kInputPHName), &slot_number)) {
+        return tensorflow::errors::InvalidArgument(
+            "Failed to parse slot number from ", node_name);
       }
+      nvinfer1::DataType dtype;
       auto shape = input_shapes.at(slot_number);
-      if (shape.dims() > 8) {
-        LOG(ERROR) << "Tensor rank is greater than 8 for " << node_name
-                   << " at input slot " << slot_number;
-        return tensorflow::errors::OutOfRange(
-            "Input tensor rank is greater than 8");
+      auto status = ValidateInputProperties(
+          shape, node_def.attr().at("dtype").type(), &dtype);
+      if (!status.ok()) {
+        const string error_message =
+            StrCat("Validation failed for ", node_name, " and input slot ",
+                   slot_number, ": ", status.error_message());
+        LOG(WARNING) << error_message;
+        return Status(status.code(), error_message);
       }
       if (VLOG_IS_ON(1)) {
         string dim_str("dims=");
@@ -2689,10 +2741,10 @@ tensorflow::Status ConvertGraphDefToEngine(
     } else if (tensorflow::str_util::StartsWith(node_name, kOutputPHName) &&
                (node_def.op() == "Identity")) {
       int32 slot_number = -1;
-      if (!tensorflow::strings::safe_strto32(node_name.c_str() + 9,
-                                             &slot_number)) {
-        LOG(ERROR) << "Failed to parse slot number from " << node_name
-                   << " +9=" << node_name.c_str() + 9;
+      if (!tensorflow::strings::safe_strto32(
+              node_name.c_str() + strlen(kOutputPHName), &slot_number)) {
+        return tensorflow::errors::InvalidArgument(
+            "Failed to parse slot number from ", node_name);
       }
       if (output_tensors.size() <= slot_number) {
         output_tensors.resize(slot_number + 1);
@@ -2751,38 +2803,20 @@ tensorflow::Status ConvertSegmentToGraphDef(
           "Cannot find node with id ", connection.outside_id, " in the graph.");
     }
     // Updates the shape and data types of input/output connections.
-    tensorflow::DataType input_type = tensorflow::DT_FLOAT;
+    tensorflow::DataType dtype;
     tensorflow::PartialTensorShape partial_shape;
     if (connection.is_input_edge) {
-      if (graph_properties.HasOutputProperties(connection.outside_node_name)) {
-        auto output_params =
-            graph_properties.GetOutputProperties(connection.outside_node_name);
-        auto out_shape = output_params.at(connection.outside_port);
-        input_type = out_shape.dtype();
-        std::vector<tensorflow::int64> dims;
-        partial_shape = out_shape.shape();
-        connection.outside_shape = partial_shape;
-      } else {
-        VLOG(0) << "Unknown output shape" << outside_node->name();
-        input_type = graph->FindNodeId(connection.outside_id)
-                         ->output_type(connection.outside_port);
-      }
-      connection.connection_type = input_type;
-
-    } else {  // output edge
-      if (graph_properties.HasInputProperties(connection.outside_node_name)) {
-        auto input_params =
-            graph_properties.GetInputProperties(connection.outside_node_name);
-        auto in_shape = input_params.at(connection.outside_port);
-        input_type = in_shape.dtype();
-        partial_shape = in_shape.shape();
-        connection.inside_shape = partial_shape;
-      } else {
-        input_type = graph->FindNodeId(connection.inside_id)
-                         ->output_type(connection.outside_port);
-      }
-      connection.connection_type = input_type;
+      GetInputProperties(graph_properties,
+                         graph->FindNodeId(connection.outside_id),
+                         connection.outside_port, &partial_shape, &dtype);
+
+    } else {
+      GetOutputProperties(graph_properties,
+                          graph->FindNodeId(connection.outside_id),
+                          connection.outside_port, &partial_shape, &dtype);
     }
+    connection.outside_shape = partial_shape;
+    connection.connection_type = dtype;
 
     // Add dummy input/output nodes to the segment graphdef.
     if (connection.is_input_edge) {
@@ -2798,7 +2832,7 @@ tensorflow::Status ConvertSegmentToGraphDef(
       auto seg_node = segment_def->add_node();
       tensorflow::NodeDefBuilder builder(node_name, "Placeholder");
       auto status = builder.Attr("shape", partial_shape)
-                        .Attr("dtype", input_type)
+                        .Attr("dtype", dtype)
                         .Finalize(seg_node);
       VLOG(1) << "Constructing input " << node_name << " for the edge "
               << connection.outside_node_name << ":" << connection.outside_port
@@ -2816,7 +2850,7 @@ tensorflow::Status ConvertSegmentToGraphDef(
       marker_nodes.insert(node_name);
       auto seg_node = segment_def->add_node();
       tensorflow::NodeDefBuilder builder(node_name, "Identity");
-      auto status = builder.Input(connection.inside_node_name, 0, input_type)
+      auto status = builder.Input(connection.inside_node_name, 0, dtype)
                         .Finalize(seg_node);
       VLOG(1) << "Constructing output " << node_name << " for the edge "
               << connection.inside_node_name << ":" << connection.inside_port
@@ -2854,6 +2888,38 @@ tensorflow::Status ConvertSegmentToGraphDef(
   return tensorflow::Status::OK();
 }
 
+bool InputEdgeValidator::operator()(const tensorflow::Edge* in_edge) const {
+  if (in_edge->IsControlEdge()) return true;
+  PartialTensorShape shape;
+  tensorflow::DataType dtype;
+  GetInputProperties(graph_properties_, in_edge->src(), in_edge->src_output(),
+                     &shape, &dtype);
+  nvinfer1::DataType trt_dtype;
+  Status status = ValidateInputProperties(shape, dtype, &trt_dtype);
+  if (!status.ok()) {
+    VLOG(2) << "--> Need to remove input node " << in_edge->dst()->name()
+            << ": " << status;
+    return false;
+  }
+  if (shape.dims() < 3 && in_edge->src()->type_string() != "Const") {
+    VLOG(2) << "--> Need to remove input node " << in_edge->dst()->name()
+            << " which has an input at port " << in_edge->dst_input()
+            << " with #dim<3 and is not a const: " << shape;
+    return false;
+  }
+  return true;
+}
+
+bool OutputEdgeValidator::operator()(const tensorflow::Edge* out_edge) const {
+  if (out_edge->IsControlEdge()) return true;
+  if (out_edge->src()->type_string() == "Const") {
+    VLOG(2) << "--> Need to remove output node " << out_edge->src()->name()
+            << " which is a Const.";
+    return false;
+  }
+  return true;
+}
+
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index 1a4c0e755d1cd1e88ac26c39996eb3a750421a0a..6ae60ec352587feb8b26d6fcc69c907a5f145760 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/contrib/tensorrt/convert/utils.h"
+#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
 #include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
 #include "tensorflow/contrib/tensorrt/resources/trt_int8_calibrator.h"
 #include "tensorflow/core/framework/graph.pb.h"
@@ -104,6 +105,8 @@ struct EngineInfo {
 //   topological order.
 // - segment_def: the output GraphDef, whose non-input/output nodedefs will be
 //   sorted in topological order.
+//
+// TODO(aaroey): add tests to validate these properties.
 tensorflow::Status ConvertSegmentToGraphDef(
     const tensorflow::Graph* graph,
     const tensorflow::grappler::GraphProperties& graph_properties,
@@ -128,6 +131,30 @@ tensorflow::Status ConvertGraphDefToEngine(
     TrtUniquePtrType<nvinfer1::ICudaEngine>* engine,
     bool* convert_successfully);
 
+// Helper class for the segmenter to determine whether an input edge to the TRT
+// segment is valid.
+class InputEdgeValidator {
+ public:
+  InputEdgeValidator(const grappler::GraphProperties& graph_properties)
+      : graph_properties_(graph_properties) {}
+
+  // Return true if the specified edge is eligible to be an input edge of the
+  // TRT segment.
+  bool operator()(const tensorflow::Edge* in_edge) const;
+
+ private:
+  const grappler::GraphProperties& graph_properties_;
+};
+
+// Helper class for the segmenter to determine whether an output edge from the
+// TRT segment is valid.
+class OutputEdgeValidator {
+ public:
+  // Return true if the specified edge is eligible to be an output edge of the
+  // TRT segment.
+  bool operator()(const tensorflow::Edge* out_edge) const;
+};
+
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/convert/utils.cc b/tensorflow/contrib/tensorrt/convert/utils.cc
index 24591cf84b244a548441b876228c5819d82ddefb..17857cf4d002b663f38248cc0ff989915ec864b4 100644
--- a/tensorflow/contrib/tensorrt/convert/utils.cc
+++ b/tensorflow/contrib/tensorrt/convert/utils.cc
@@ -24,7 +24,7 @@ bool IsGoogleTensorRTEnabled() {
   // safely write code that uses tensorrt conditionally. E.g. if it does not
   // check for for tensorrt, and user mistakenly uses tensorrt, they will just
   // crash and burn.
-#ifdef GOOGLE_TENSORRT
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
   return true;
 #else
   return false;
diff --git a/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
index a89cf3ab8bfaecc74fc5890ccb7e7a7147278182..69058c5826822c519a69d50860c06b8ab3ec6578 100644
--- a/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
+++ b/tensorflow/contrib/tensorrt/custom_plugin_examples/BUILD
@@ -112,7 +112,9 @@ cuda_py_test(
     ],
     tags = [
         "manual",
+        "no_windows",
         "noguitar",
+        "nomac",
         "notap",
     ],
 )
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 54009179a845f0f8aa79c2f6801e94ac7c00f535..6699b71d285f1f4fa8cc9bb66679c65e71d16dcc 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -15,9 +15,11 @@ limitations under the License.
 #include "tensorflow/contrib/tensorrt/kernels/trt_engine_op.h"
 
 #include <algorithm>
+
 #include "tensorflow/contrib/tensorrt/convert/convert_nodes.h"
 #include "tensorflow/contrib/tensorrt/convert/utils.h"
 #include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include "tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.h"
 #include "tensorflow/contrib/tensorrt/resources/trt_resource_manager.h"
 #include "tensorflow/contrib/tensorrt/resources/trt_resources.h"
 #include "tensorflow/core/framework/graph_to_functiondef.h"
@@ -43,11 +45,11 @@ using ::tensorflow::strings::StrCat;
 // Helps simultaneous execution of native and TRT engines.
 class AsyncHelper : public tensorflow::core::RefCounted {
  public:
-  AsyncHelper(tensorflow::AsyncOpKernel::DoneCallback done) { done_ = done; }
+  AsyncHelper(AsyncOpKernel::DoneCallback done) { done_ = done; }
   ~AsyncHelper() override { done_(); }
 
  private:
-  tensorflow::AsyncOpKernel::DoneCallback done_;
+  AsyncOpKernel::DoneCallback done_;
 };
 
 #define TYPECASE(dt, X, Y)                                                \
@@ -150,7 +152,7 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)
   }
 }
 
-void TRTEngineOp::ExecuteNativeSegment(tensorflow::OpKernelContext* ctx,
+void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx,
                                        AsyncHelper* helper) {
   if (!calibration_mode_) {
     VLOG(1) << "Executing native engine";
@@ -191,7 +193,7 @@ void TRTEngineOp::ExecuteNativeSegment(tensorflow::OpKernelContext* ctx,
            });
 }
 
-void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx,
+void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,
                                      AsyncHelper* helper) {
   helper->Ref();
   tensorflow::core::ScopedUnref sc(helper);
@@ -236,7 +238,7 @@ void TRTEngineOp::ExecuteCalibration(tensorflow::OpKernelContext* ctx,
   ExecuteNativeSegment(ctx, helper);
 }
 
-int TRTEngineOp::GetEngineBatch(tensorflow::OpKernelContext* ctx) {
+int TRTEngineOp::GetEngineBatch(OpKernelContext* ctx) {
   int num_batch = ctx->input(0).shape().dim_size(0);
   int smallest_engine = 0;
   for (const auto i : cached_engine_batches_) {
@@ -252,21 +254,20 @@ int TRTEngineOp::GetEngineBatch(tensorflow::OpKernelContext* ctx) {
       cached_engine_batches_.push_back(num_batch);
       VLOG(1) << "Running with batch size " << num_batch;
     } else {
-      string s("Engine buffer is full. buffer limit= ");
-      StrAppend(&s, max_cached_engines_, ", current entries= ");
-      for (auto i : cached_engine_batches_) StrAppend(&s, i, ", ");
-      StrAppend(&s, "Requested batch= ", num_batch);
-      LOG(ERROR) << s;
-      ctx->SetStatus(tensorflow::errors::ResourceExhausted(
-          "Requested batch size is not available and engine cache is full"));
+      string msg =
+          StrCat("Engine buffer is full. buffer limit=", max_cached_engines_,
+                 ", current entries=");
+      for (auto i : cached_engine_batches_) StrAppend(&msg, i, ",");
+      StrAppend(&msg, "Requested batch=", num_batch);
+      LOG(WARNING) << msg;
       return -1;
     }
   }
   return smallest_engine;
 }
 
-void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
-                               tensorflow::AsyncOpKernel::DoneCallback done) {
+void TRTEngineOp::ComputeAsync(OpKernelContext* ctx,
+                               AsyncOpKernel::DoneCallback done) {
   auto helper = new AsyncHelper(done);
   tensorflow::core::ScopedUnref sc(helper);
   if (calibration_mode_) {
@@ -274,32 +275,52 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
     return;
   }
   const int smallest_engine = GetEngineBatch(ctx);
-  if (smallest_engine < 0) return;  // GetEngineBatch already set the status.
+  if (smallest_engine < 0) {
+    LOG(WARNING) << "Failed to get engine batch, running native segment";
+    ExecuteNativeSegment(ctx, helper);
+    return;
+  }
 
   const int num_batch = ctx->input(0).shape().dim_size(0);
   auto& engine_ctx_pair = GetEngine(smallest_engine, ctx);
   auto& trt_engine_ptr = engine_ctx_pair.first;
   if (!trt_engine_ptr) {
     LOG(WARNING) << "Engine retrieval for batch size " << num_batch
-                 << " failed Running native segment";
+                 << " failed. Running native segment";
     ExecuteNativeSegment(ctx, helper);
     return;
   }
+  const bool retry = ExecuteTrtEngine(ctx, num_batch, trt_engine_ptr.get(),
+                                      engine_ctx_pair.second.get());
+  if (retry) {
+    LOG(WARNING) << "Failed to execute engine, retrying with native segment";
+    ExecuteNativeSegment(ctx, helper);
+    return;
+  }
+}
 
+bool TRTEngineOp::ExecuteTrtEngine(
+    OpKernelContext* ctx, const int num_batch,
+    nvinfer1::ICudaEngine* trt_engine_ptr,
+    nvinfer1::IExecutionContext* trt_execution_context_ptr) {
+  const bool kRetry = true;
   const int num_binding = ctx->num_inputs() + ctx->num_outputs();
   std::vector<void*> buffers(num_binding);
   for (int i = 0; i < ctx->num_inputs(); i++) {
-    const string inp_name = StrCat(kInputPHName, i);
+    const string input_name = StrCat(kInputPHName, i);
     const size_t binding_index =
-        trt_engine_ptr->getBindingIndex(inp_name.c_str());
+        trt_engine_ptr->getBindingIndex(input_name.c_str());
+    if (binding_index == -1) {
+      LOG(ERROR) << "Input node not found, at " << input_name;
+      return kRetry;
+    }
 
     const Tensor& input_tensor = ctx->input(i);
     const TensorShape& input_shape = input_tensor.shape();
     if (num_batch != input_shape.dim_size(0)) {
-      LOG(ERROR) << "input data inconsistent batch size";
-      ctx->SetStatus(tensorflow::errors::FailedPrecondition(
-          "Different batch sizes between input tensors"));
-      return;
+      LOG(ERROR) << "Input data has inconsistent batch size: " << num_batch
+                 << " vs " << input_shape.dim_size(0);
+      return kRetry;
     }
     auto dtype = trt_engine_ptr->getBindingDataType(binding_index);
     switch (dtype) {
@@ -308,14 +329,10 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
         break;
       case nvinfer1::DataType::kHALF:
         LOG(ERROR) << "FP16 inputs are not supported yet!";
-        ctx->SetStatus(tensorflow::errors::InvalidArgument(
-            "FP16 inputs are not supported!"));
-        return;
+        return kRetry;
       case nvinfer1::DataType::kINT8:
         LOG(ERROR) << "INT8 inputs are not supported yet!";
-        ctx->SetStatus(tensorflow::errors::InvalidArgument(
-            "INT8 inputs are not supported!"));
-        return;
+        return kRetry;
 #if NV_TENSORRT_MAJOR > 3
       case nvinfer1::DataType::kINT32:
         buffers[binding_index] = (void*)(input_tensor.flat<int32>().data());
@@ -323,9 +340,7 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
 #endif
       default:
         LOG(ERROR) << "Unknown TRT data type: " << int(dtype);
-        ctx->SetStatus(tensorflow::errors::InvalidArgument(
-            "Unknown output TRT data type! ", static_cast<int>(dtype)));
-        return;
+        return kRetry;
     }
   }
 
@@ -342,20 +357,23 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
       std::vector<int> trt_shape(dims.nbDims + 1);
       trt_shape[0] = num_batch;
       for (int j = 0; j < dims.nbDims; j++) trt_shape[j + 1] = dims.d[j];
-      OP_REQUIRES_OK(
-          ctx, TensorShapeUtils::MakeShape(trt_shape.data(), trt_shape.size(),
-                                           &output_shape));
+      auto status = TensorShapeUtils::MakeShape(
+          trt_shape.data(), trt_shape.size(), &output_shape);
+      if (!status.ok()) {
+        LOG(ERROR) << "Failed to get output shape: " << status;
+        return kRetry;
+      }
     } else {
-      LOG(ERROR) << "output node not found, at " << output_name;
-      ctx->SetStatus(tensorflow::errors::Internal("output ", output_name,
-                                                  " couldn't be found!"));
-      return;
+      LOG(ERROR) << "Output node not found, at " << output_name;
+      return kRetry;
     }
     auto status = ctx->allocate_output(i, output_shape, &output_tensor);
     if (!status.ok()) {
       LOG(ERROR) << "Allocating output failed with " << status;
       ctx->SetStatus(status);
-      return;
+      // Do not retry since we cannot allocate the same output twice.
+      // TODO(aaroey): ideally we should retry, fix this.
+      return !kRetry;
     }
     auto dtype = trt_engine_ptr->getBindingDataType(binding_index);
     switch (dtype) {
@@ -364,15 +382,11 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
             reinterpret_cast<void*>(output_tensor->flat<float>().data());
         break;
       case nvinfer1::DataType::kHALF:
-        LOG(ERROR) << "half size is not supported yet!";
-        ctx->SetStatus(tensorflow::errors::InvalidArgument(
-            "Half outputs are not supported!"));
-        return;
+        LOG(WARNING) << "half size is not supported yet!";
+        return kRetry;
       case nvinfer1::DataType::kINT8:
-        LOG(ERROR) << "int8 is not supported yet!";
-        ctx->SetStatus(tensorflow::errors::InvalidArgument(
-            "INT8 outputs are not supported!"));
-        return;
+        LOG(WARNING) << "int8 is not supported yet!";
+        return kRetry;
 #if NV_TENSORRT_MAJOR > 3
       case nvinfer1::DataType::kINT32:
         buffers[binding_index] =
@@ -380,13 +394,11 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
         break;
 #endif
       default:
-        LOG(ERROR) << "Unknown TRT data type: " << static_cast<int>(dtype);
-        ctx->SetStatus(tensorflow::errors::InvalidArgument(
-            "Unsupported output data type! ", static_cast<int>(dtype)));
-        return;
+        LOG(WARNING) << "Unknown TRT data type: " << static_cast<int>(dtype);
+        return kRetry;
     }
   }
-  // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files
+  // Copied from cuda_kernel_helper since it seems only valid in *.cu.cc files
   const cudaStream_t* stream = CHECK_NOTNULL(
       reinterpret_cast<const cudaStream_t*>(ctx->op_device_context()
                                                 ->stream()
@@ -394,15 +406,14 @@ void TRTEngineOp::ComputeAsync(tensorflow::OpKernelContext* ctx,
                                                 ->GpuStreamMemberHack()));
 
   // TODO(jie): trt enqueue does not return error
-  auto& trt_execution_context_ptr = engine_ctx_pair.second;
   auto ret = trt_execution_context_ptr->enqueue(num_batch, &buffers[0], *stream,
                                                 nullptr);
   if (!ret) {
-    LOG(ERROR) << "Failed to enqueue batch for TRT engine: " << name();
-    ctx->SetStatus(tensorflow::errors::Internal(
-        "Failed to enqueue batch for TRT engine: ", name()));
+    LOG(WARNING) << "Failed to enqueue batch for TRT engine: " << name();
+    return kRetry;
   }
-  // sync should be done by TF.
+  // Synchronization will be done by TF.
+  return !kRetry;
 }
 
 TRTEngineOp::~TRTEngineOp() {
@@ -422,8 +433,6 @@ nvinfer1::IGpuAllocator* TRTEngineOp::GetAllocator(OpKernelContext* ctx) {
   if (!alloc) {
     LOG(ERROR) << "Can't find device allocator for gpu device "
                << device->name();
-    ctx->SetStatus(tensorflow::errors::Internal(
-        "Can't get device allocator for device ", device->name()));
     return nullptr;
   }
   allocator_.reset(new TRTDeviceAllocator(alloc));
@@ -450,14 +459,14 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
 #if NV_TENSORRT_MAJOR > 3
     auto allocator = GetAllocator(ctx);
     if (allocator == nullptr) {
-      // GetAllocator already set the Status.
       return null_pair;
     }
     infer->setGpuAllocator(allocator);
 #endif
     TrtUniquePtrType<nvinfer1::ICudaEngine> static_engine(
         infer->deserializeCudaEngine(serialized_segment_.c_str(),
-                                     serialized_segment_.size(), nullptr));
+                                     serialized_segment_.size(),
+                                     PluginFactoryTensorRT::GetInstance()));
     auto raw_static_engine = static_engine.get();
     const auto max_batch_size = raw_static_engine->getMaxBatchSize();
     engine_map_[max_batch_size] = {
@@ -466,7 +475,9 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
             raw_static_engine->createExecutionContext())};
     // Runtime is safe to delete after engine creation
     serialized_segment_.clear();
-    if (max_batch_size < batch_size) return null_pair;
+    if (max_batch_size < batch_size) {
+      return null_pair;
+    }
     return engine_map_.at(max_batch_size);
   }  // static_engine_
 
@@ -478,7 +489,6 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
 #if NV_TENSORRT_MAJOR > 3
     allocator = GetAllocator(ctx);
     if (allocator == nullptr) {
-      // GetAllocator already set the Status.
       return null_pair;
     }
 #endif
@@ -502,9 +512,8 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
         // retry in the future.
         engine_map_[batch_size] = {nullptr, nullptr};
       }
-      LOG(ERROR) << "Engine creation for batch size " << batch_size
-                 << " failed " << status;
-      ctx->SetStatus(tensorflow::errors::Internal("Engine creation failed!"));
+      LOG(WARNING) << "Engine creation for batch size " << batch_size
+                   << " failed " << status;
       return null_pair;
     }
     VLOG(1) << "Conversion is done";
@@ -516,7 +525,7 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
 }
 
 tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
-    tensorflow::OpKernelContext* ctx, TRTCalibrationResource** cr) {
+    OpKernelContext* ctx, TRTCalibrationResource** cr) {
   auto cres = new TRTCalibrationResource();
   *cr = cres;
   // Get the allocator.
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
index 9265250605dc7bbf2d85c6853bc6e8bf379ce72f..59b744e6d35d603795c0e87c89c0a8d56c26b3cb 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
@@ -60,6 +60,12 @@ class TRTEngineOp : public AsyncOpKernel {
   // Execute replaced native segment as function Op.
   void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper);
 
+  // Execute the tensorrt engine. Returns whether we need to retry by running
+  // the native segment.
+  bool ExecuteTrtEngine(OpKernelContext* ctx, const int num_batch,
+                        nvinfer1::ICudaEngine* trt_engine_ptr,
+                        nvinfer1::IExecutionContext* trt_execution_context_ptr);
+
   // Allocate necessary resources for calibration
   Status AllocateCalibrationResources(OpKernelContext* ctx,
                                       TRTCalibrationResource** cr);
diff --git a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
index 2bc591484dcaf5b35c39f3d0523dd89dcd152e6a..cccc91226265ed139fb8db0b71c40b868f729562 100644
--- a/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
+++ b/tensorflow/contrib/tensorrt/plugin/trt_plugin_factory.cc
@@ -65,9 +65,6 @@ bool PluginFactoryTensorRT::RegisterPlugin(
 
 void PluginFactoryTensorRT::DestroyPlugins() {
   tensorflow::mutex_lock lock(instance_m_);
-  for (auto& owned_plugin_ptr : owned_plugins_) {
-    owned_plugin_ptr.release();
-  }
   owned_plugins_.clear();
 }
 
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
index 81d7330b49bfaf16bd9691d4b4fdc9b00d493955..d8f97bfbbc7adb10a5dda6fbc2f7a660f6cd7742 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
+++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.cc
@@ -19,12 +19,42 @@ limitations under the License.
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
+#include "cuda/include/cuda_runtime_api.h"
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+namespace tensorflow {
+namespace tensorrt {
+
+// std::align is not supported, so this method mimic its behavior.
+void* Align(size_t alignment, size_t size, void*& ptr, size_t& space) {
+  QCHECK_GT(alignment, 0) << "alignment must be greater than 0.";
+  QCHECK_EQ(0, alignment & (alignment - 1)) << "Alignment must be power of 2.";
+  QCHECK_GT(size, 0) << "size must be greater than 0.";
+  QCHECK(ptr) << "ptr must not be nullptr.";
+  QCHECK_GT(space, 0) << "space must be greater than 0.";
+  const uintptr_t ptr_val = reinterpret_cast<uintptr_t>(ptr);
+  QCHECK_GE(ptr_val + space, ptr_val) << "Provided space overflows.";
 
+  if (size > space) return nullptr;
+  const uintptr_t aligned_ptr_val = ((ptr_val + alignment - 1) & -alignment);
+  if (aligned_ptr_val > ptr_val + space - size) return nullptr;
+  ptr = reinterpret_cast<void*>(aligned_ptr_val);
+  const uintptr_t diff = aligned_ptr_val - ptr_val;
+  space -= diff;
+  return ptr;
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #if NV_TENSORRT_MAJOR > 2
-#include "cuda/include/cuda_runtime_api.h"
 
 namespace tensorflow {
 namespace tensorrt {
+
 void* TRTCudaAllocator::allocate(uint64_t size, uint64_t alignment,
                                  uint32_t flags) {
   assert((alignment & (alignment - 1)) == 0);  // zero or a power of 2.
@@ -44,17 +74,16 @@ void* TRTDeviceAllocator::allocate(uint64_t size, uint64_t alignment,
   assert((alignment & (alignment - 1)) == 0);  // zero or a power of 2.
   size_t total_size = size + alignment;
   void* mem = allocator_->AllocateRaw(alignment, total_size);
-  if (!mem) {
-    return nullptr;
-  }
+  if (!mem) return nullptr;
 
   void* alloc_mem = mem;
-  CHECK(std::align(alignment, size, mem, total_size));
+  QCHECK(Align(alignment, size, mem, total_size));
   if (mem != alloc_mem) {
-    CHECK(mem_map_.insert({mem, alloc_mem}).second);
+    QCHECK(mem_map_.insert({mem, alloc_mem}).second);
   }
-  VLOG(2) << "Allocated " << size << " bytes with alignment " << alignment
-          << " @ " << mem;
+  VLOG(2) << "Allocated " << total_size << " bytes memory @" << alloc_mem
+          << "; aligned to " << size << " bytes @" << mem << " with alignment "
+          << alignment;
   return mem;
 }
 
@@ -80,5 +109,5 @@ void TRTDeviceAllocator::free(void* memory) {
 }  // namespace tensorflow
 
 #endif
-#endif
-#endif
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator.h b/tensorflow/contrib/tensorrt/resources/trt_allocator.h
index b8825b108db252310dcf665741c5c83704a64091..6f944920835b475fc7d12167dbcefa0111b6fb19 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_allocator.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_allocator.h
@@ -16,13 +16,25 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
 #define TENSORFLOW_CONTRIB_TENSORRT_RESOURCES_TRT_ALLOCATOR_H_
 
-#include "tensorflow/contrib/tensorrt/log/trt_logger.h"
+#include <unordered_map>
+
 #include "tensorflow/core/framework/allocator.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
 #include "tensorrt/include/NvInfer.h"
+#endif  // GOOGLE_TENSORRT
+#endif  // GOOGLE_CUDA
+
+namespace tensorflow {
+namespace tensorrt {
+// std::align is not supported, so this function mimic its behavior.
+void* Align(size_t alignment, size_t size, void*& ptr, size_t& space);
+}  // namespace tensorrt
+}  // namespace tensorflow
 
+#if GOOGLE_CUDA
+#if GOOGLE_TENSORRT
 #if NV_TENSORRT_MAJOR == 3
 // Define interface here temporarily until TRT 4.0 is released
 namespace nvinfer1 {
diff --git a/tensorflow/contrib/tensorrt/resources/trt_allocator_test.cc b/tensorflow/contrib/tensorrt/resources/trt_allocator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f515ed03f245f11ad461bac07970c5001a56aaad
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/resources/trt_allocator_test.cc
@@ -0,0 +1,79 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/tensorrt/resources/trt_allocator.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace tensorrt {
+
+bool RunTest(const size_t alignment, const size_t size,
+             const intptr_t orig_ptr_val, const size_t orig_space) {
+  void* const orig_ptr = reinterpret_cast<void*>(orig_ptr_val);
+  void* ptr = orig_ptr;
+  size_t space = orig_space;
+  void* result = Align(alignment, size, ptr, space);
+  if (result == nullptr) {
+    EXPECT_EQ(orig_ptr, ptr);
+    EXPECT_EQ(orig_space, space);
+    return false;
+  } else {
+    EXPECT_EQ(result, ptr);
+    const intptr_t ptr_val = reinterpret_cast<intptr_t>(ptr);
+    EXPECT_EQ(0, ptr_val % alignment);
+    EXPECT_GE(ptr_val, orig_ptr_val);
+    EXPECT_GE(space, size);
+    EXPECT_LE(space, orig_space);
+    EXPECT_EQ(ptr_val + space, orig_ptr_val + orig_space);
+    return true;
+  }
+}
+
+TEST(TRTAllocatorTest, Align) {
+  for (const size_t space :
+       {1, 2, 3, 4, 7, 8, 9, 10, 16, 32, 511, 512, 513, 700, 12345}) {
+    for (size_t alignment = 1; alignment <= space * 4; alignment *= 2) {
+      for (const intptr_t ptr_val :
+           {1ul, alignment == 1 ? 1ul : alignment - 1, alignment, alignment + 1,
+            alignment + (alignment / 2)}) {
+        if (ptr_val % alignment == 0) {
+          for (const size_t size :
+               {1ul, space == 1 ? 1ul : space - 1, space, space + 1}) {
+            EXPECT_EQ(space >= size, RunTest(alignment, size, ptr_val, space));
+          }
+        } else {
+          EXPECT_FALSE(RunTest(alignment, space, ptr_val, space));
+          const size_t diff = alignment - ptr_val % alignment;
+          if (space > diff) {
+            EXPECT_TRUE(
+                RunTest(alignment, space - diff, ptr_val + diff, space - diff));
+            for (const size_t size :
+                 {1ul, space - diff > 1 ? space - diff - 1 : 1ul, space - diff,
+                  space - diff + 1, space - 1}) {
+              EXPECT_EQ(space - diff >= size,
+                        RunTest(alignment, size, ptr_val, space));
+            }
+          } else {
+            EXPECT_FALSE(RunTest(alignment, 1, ptr_val, space));
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace tensorrt
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index cc42913ecadc3e15fbb4a4a322f125579f075da2..008fffc95430b1c423788a4e958e06e700cac233 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
 
+#include <queue>
 #include <set>
 #include <unordered_map>
 #include <vector>
@@ -32,6 +33,7 @@ namespace tensorflow {
 namespace tensorrt {
 namespace segment {
 using ::tensorflow::strings::StrAppend;
+
 // A simple graph representation to mirror tensorflow::Graph. This structure
 // helps saving memory since segmenter modifies the graph in place, preventing
 // the need to create a copy of the graph. It is composed of edges and nodes.
@@ -215,7 +217,7 @@ namespace {
 
 bool CheckCycles(const std::unique_ptr<SimpleGraph>& g, const SimpleNode* src,
                  const std::vector<SimpleNode*>& start) {
-  // copied from TF ReverseDFS.
+  // Copied from TF ReverseDFS, which only works for tensorflow::Graph.
   struct Work {
     SimpleNode* node;
     bool leave;  // Are we entering or leaving n?
@@ -269,6 +271,24 @@ bool CanContractEdge(const SimpleEdge* edge,
   //   1. Get all nodes incoming to 'dst', excluding 'src'
   //   2. Reverse DFS from those nodes
   //   3. If reverse DFS reaches 'src' then we have a cycle
+  //
+  // TODO(aaroey): there are several problems with the current approach:
+  // 1. src->dst->src, this is not detected but it should be;
+  // 2. src->dst->...(any node sequence that doesn't contain src)...->dst, this
+  //    is detected but it should not be.
+  //
+  // Note that it's fine that dst connects back to src indirectly (i.e. through
+  // a path with length > 1 that consists of intermedia nodes other than src).
+  // While loops is one example.
+  //
+  // The goal is to make sure that the trt subgraph:
+  // 1. has no loops (i.e. is a DAG), and
+  // 2. if there is a path in the subgraph from X to Y (X and Y are both nodes
+  //    in the subgraph), then all paths from X to Y are in the subgraph.
+  //
+  // To achieve this goal, the correct way seems to be:
+  // 1. remove any direct edge from src->dst;
+  // 2. detect if src can reach dst, if so they cannot be merged.
   std::vector<SimpleNode*> dfs_start_nodes;
   for (SimpleNode* node : dst->in_nodes()) {
     if (node != src) {
@@ -276,8 +296,8 @@ bool CanContractEdge(const SimpleEdge* edge,
     }
   }
 
-  bool is_cycle = CheckCycles(graph, src, dfs_start_nodes);
-  return !is_cycle;
+  const bool has_cycle = CheckCycles(graph, src, dfs_start_nodes);
+  return !has_cycle;
 }
 }  // namespace
 
@@ -342,22 +362,20 @@ void ContractEdge(SimpleEdge* edge, SimpleGraph* graph,
 }
 
 tensorflow::Status SegmentGraph(
-    const tensorflow::GraphDef& gdef,
-    const std::function<bool(const tensorflow::Node*)>& candidate_fn,
-    const SegmentOptions& options, SegmentNodesVector* segments) {
-  // Create a Graph representation of the GraphDef.
-  tensorflow::FunctionLibraryDefinition flib(tensorflow::OpRegistry::Global(),
-                                             gdef.library());
-  tensorflow::Graph graph(flib);
-  TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToGraph(
-      tensorflow::GraphConstructorOptions(), gdef, &graph));
-  return SegmentGraph(&graph, candidate_fn, options, segments);
-}
-
-tensorflow::Status SegmentGraph(
-    tensorflow::Graph* tf_graph,
+    const tensorflow::Graph* tf_graph,
     const std::function<bool(const tensorflow::Node*)>& candidate_fn,
+    const std::function<bool(const tensorflow::Edge*)>& input_candidate_fn,
+    const std::function<bool(const tensorflow::Edge*)>& output_candidate_fn,
     const SegmentOptions& options, SegmentNodesVector* segments) {
+  // Steps:
+  // 1. run the segmentation algorithm to find all the segments, which uses
+  //    candidate_fn to determine the candidates segment nodes;
+  // 2. for each segments, remove the nodes that are inputs/outputs of the
+  //    segment but are not eligible, using input/output_candidate_fn to
+  //    determine the eligibilities;
+  // 3. convert the segment into expected return format and return the result.
+
+  // --------------------------------- Step 1 ---------------------------------
   auto graph = std::unique_ptr<SimpleGraph>(new SimpleGraph(tf_graph));
   // Use a union-find to collect the nodes that belong to the same
   // segment. A node value of nullptr indicates that the node is not a candidate
@@ -372,14 +390,19 @@ tensorflow::Status SegmentGraph(
     node_segments.emplace_back(node);
   }
 
-  // The segmentation algorithm below visits nodes in reverse
-  // topological order and attempts to merge nodes along output
-  // edges. That means that subgraphs grow from the output-side of the
-  // network towards the inputs. In general this is not guaranteed to
-  // produce a globally optimal segmentation. In the future if we have
-  // a measure of how beneficial it is to include a given node in a
-  // TRT subgraph then we can revisit this algorithm to take advantage
-  // of that information.
+  // The segmentation algorithm below visits nodes in reverse topological order
+  // and attempts to merge nodes along output edges. That means that subgraphs
+  // grow from the output-side of the network towards the inputs.
+  //
+  // In general this is not guaranteed to produce a globally optimal
+  // segmentation. For exaample, consider graph with node {A, B, C, D} and edges
+  // {A->B, A->C, B->D, C->D), where A, B, D are trt compatible but C is not, so
+  // in theory we can choose to contract either A, B or B, D but not both, but
+  // here it always choose to contract B, D.
+  //
+  // In the future if we have a measure of how beneficial it is to include a
+  // given node in a TRT subgraph then we can revisit this algorithm to take
+  // advantage of that information.
   std::vector<tensorflow::Node*> tforder;
   tensorflow::GetPostOrder(*tf_graph, &tforder);
   // use postorder implementation from tensorflow and construct mirror in
@@ -392,13 +415,11 @@ tensorflow::Status SegmentGraph(
   for (const SimpleNode* node : order) {
     // All output nodes of 'node' have been visited...
     VLOG(2) << "Trying node " << node->name() << " id=" << node->id();
-
     // 'node' must be a TRT candidate...
     if (node_segments[node->id()].Value() == nullptr) {
       VLOG(2) << "... not a TRT candidate";
       continue;
     }
-
     // Contract output edges to combine 'node' with output
     // nodes. Iterate since combining two nodes may unblock other
     // combining.
@@ -416,7 +437,6 @@ tensorflow::Status SegmentGraph(
           VLOG(2) << "... ... not a TRT candidate";
           continue;
         }
-
         if (CanContractEdge(out_edge, graph)) {
           VLOG(2) << "... ... can contract";
           contract_edges.insert(out_edge);
@@ -424,11 +444,9 @@ tensorflow::Status SegmentGraph(
           VLOG(2) << "... ... cannot contract, would form cycle";
         }
       }
-
       if (contract_edges.empty()) {
         break;
       }
-
       // Contract edges and collect the adjacent nodes into the same
       // segment/subgraph.
       while (!contract_edges.empty()) {
@@ -457,11 +475,22 @@ tensorflow::Status SegmentGraph(
 
   // Collect the segments/subgraphs. Each subgraph is represented by a
   // set of the names of the nodes in that subgraph.
-  std::unordered_map<string, std::set<string>> sg_map;
+
+  // A map from the segment identifier (currently the name of the root node of
+  // the segment tree) to the segment nodes set.
+  std::unordered_map<string, std::set<const tensorflow::Node*>> sg_map;
+
+  // A map from the segment identifier (currently the name of the root node of
+  // the segment tree) to the device names that the nodes in the segment are
+  // assigned to.
+  //
+  // TODO(aaroey): nodes assigned to different devices should not be merged,
+  // fix this.
   std::unordered_map<string, std::set<string>> device_maps;
+
   for (auto& u : node_segments) {
     if ((u.Value() != nullptr) && (u.ParentValue() != nullptr)) {
-      sg_map[u.ParentValue()->name()].insert(u.Value()->name());
+      sg_map[u.ParentValue()->name()].insert(u.Value()->tf_node());
       auto tf_node = u.Value()->tf_node();
       // has_assigned_device_name() is expected to return true
       // when called from optimization pass. However, since graph
@@ -482,25 +511,104 @@ tensorflow::Status SegmentGraph(
     }
   }
 
+  // --------------------------------- Step 2 ---------------------------------
+  // Remove ineligible input/output nodes.
+  for (auto& itr : sg_map) {
+    std::set<const tensorflow::Node*>& segment_nodes = itr.second;
+    VLOG(1) << "Segment original size: " << segment_nodes.size();
+    while (true) {
+      std::deque<const tensorflow::Node*> in_nodes_que, out_nodes_que;
+      // Find an input node that is not eligible and add it to the queue.
+      // Nodes that has no incoming edges should not be treated as "input",
+      // as there are really no inputs to them. Similar for output nodes.
+      for (auto node : segment_nodes) {
+        bool added = false;
+        for (const tensorflow::Edge* edge : node->in_edges()) {
+          if (!edge->IsControlEdge() && !edge->src()->IsSource() &&
+              !segment_nodes.count(edge->src())) {  // 'node' is an input node.
+            if (!input_candidate_fn(edge)) {
+              in_nodes_que.push_back(node);
+              added = true;
+              break;
+            }
+          }
+        }
+        if (added) continue;  // Only adding the node once to either queue.
+        for (const tensorflow::Edge* edge : node->out_edges()) {
+          if (!edge->dst()->IsSink() && !edge->IsControlEdge() &&
+              !segment_nodes.count(edge->dst())) {  // 'node' is an output node.
+            if (!output_candidate_fn(edge)) {
+              out_nodes_que.push_back(node);
+              break;
+            }
+          }
+        }
+      }
+      if (in_nodes_que.empty() && out_nodes_que.empty()) {
+        // No more ineligible input/output nodes.
+        break;
+      }
+      // Now for each ineligible node, remove all of its inputs or outputs from
+      // the subgraph.
+      //
+      // It can be proven that, if the original subgraph:
+      // 1. is a DAG, and
+      // 2. all paths between two nodes in the subgraph are all inside the
+      //    subgraph
+      // then after doing this operation the resulting subgraph will keep the
+      // same properties 1 and 2.
+      //
+      // For simplicity we use heuristics: for input nodes remove all its
+      // input, for output nodes remove all its output. In this way, for common
+      // cases the number of removed nodes should be minimum.
+      auto remove_nodes = [&segment_nodes](
+                              bool is_input_nodes,
+                              std::deque<const tensorflow::Node*>* que) {
+        // Run a BFS on the queue to find all the input/output nodes.
+        std::set<const tensorflow::Node*> visited;
+        while (!que->empty()) {
+          auto node = que->front();
+          que->pop_front();
+          if (!visited.insert(node).second) continue;
+          segment_nodes.erase(node);
+          for (auto in :
+               is_input_nodes ? node->in_nodes() : node->out_nodes()) {
+            if (segment_nodes.count(in)) {
+              que->push_back(in);
+              VLOG(2) << "Need to remove node " << in->name()
+                      << " because one of its "
+                      << (is_input_nodes ? "output" : "input")
+                      << " nodes in the graph was removed: " << node->name();
+            }
+          }
+        }
+      };
+      remove_nodes(true, &in_nodes_que);
+      remove_nodes(false, &out_nodes_que);
+    }
+    VLOG(1) << "Segment new size: " << segment_nodes.size();
+  }
+
+  // --------------------------------- Step 3 ---------------------------------
   // Convert the segments into the expected return format
   for (const auto& itr : sg_map) {
-    const auto& segment_node_names = itr.second;
+    const std::set<const tensorflow::Node*>& segment_nodes = itr.second;
     if (VLOG_IS_ON(1)) {
       string s;
-      for (const auto& name : segment_node_names) {
-        s += " " + name;
-      }
-      VLOG(1) << "Segment " << segments->size() << ":" << s;
+      for (auto node : segment_nodes) s += " " + node->name();
+      VLOG(1) << "Segment " << segments->size() << ": " << s;
     }
 
     // Don't use small segments.
-    if (static_cast<int>(segment_node_names.size()) <
-        options.minimum_segment_size) {
+    if (static_cast<int>(segment_nodes.size()) < options.minimum_segment_size) {
       VLOG(1) << "Segment " << segments->size() << " has only "
-              << segment_node_names.size() << " nodes, dropping";
+              << segment_nodes.size() << " nodes, dropping";
       continue;
     }
+
     // TODO(sami): Make segmenter placement aware once trtscopes are in place
+    std::set<string> segment_node_names;
+    for (auto node : itr.second) segment_node_names.insert(node->name());
     const auto& dev_itr = device_maps.find(itr.first);
     if (dev_itr == device_maps.end() || dev_itr->second.empty()) {
       VLOG(1) << "No device assigned to segment " << segments->size();
diff --git a/tensorflow/contrib/tensorrt/segment/segment.h b/tensorflow/contrib/tensorrt/segment/segment.h
index 81b4bfe49fe375d19f4c7811459f38e25d2edea8..8c44eb782aa37052680d0e06023f29dc65e327c6 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.h
+++ b/tensorflow/contrib/tensorrt/segment/segment.h
@@ -40,22 +40,6 @@ struct SegmentOptions {
   std::set<string> exclude_node_list;
 };
 
-// Get the subgraphs of a graph that can be handled by TensorRT.
-//
-// @param gdef The GraphDef describing the network
-// @param candidate_fn A function that returns true for a NodeDef if
-// that node can be handled by TensorRT.
-// @param segments Returns the TensorRT segments/subgraphs. Each entry
-// in the vector describes a subgraph by giving a set of the names of
-// all the NodeDefs in that subgraph.
-// @return the status.
-//
-// TODO(aaroey): remove this method.
-tensorflow::Status SegmentGraph(
-    const tensorflow::GraphDef& gdef,
-    const std::function<bool(const tensorflow::Node*)>& candidate_fn,
-    const SegmentOptions& options, SegmentNodesVector* segments);
-
 // Get the subgraphs of a graph that can be handled by TensorRT.
 //
 // @param graph tensorflow::Graph of the network
@@ -66,8 +50,10 @@ tensorflow::Status SegmentGraph(
 // all the NodeDefs in that subgraph.
 // @return the status.
 tensorflow::Status SegmentGraph(
-    tensorflow::Graph* tf_graph,
+    const tensorflow::Graph* tf_graph,
     const std::function<bool(const tensorflow::Node*)>& candidate_fn,
+    const std::function<bool(const tensorflow::Edge*)>& input_candidate_fn,
+    const std::function<bool(const tensorflow::Edge*)>& output_candidate_fn,
     const SegmentOptions& options, SegmentNodesVector* segments);
 
 }  // namespace segment
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index f5b2d258d70d5577a9d68f2d9f6d6e678ede97ce..432e7b1c047cb3b22d47f7432b6aad639a3a3b2d 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -14,350 +14,245 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/contrib/tensorrt/segment/segment.h"
-#include "tensorflow/c/c_api.h"
-#include "tensorflow/core/framework/graph.pb.h"
+
+#include "tensorflow/cc/framework/scope.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/graph/testlib.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session.h"
 
 namespace tensorflow {
 namespace tensorrt {
 namespace segment {
 namespace test {
+namespace ops = ::tensorflow::ops;
 
 class SegmentTest : public ::testing::Test {
- public:
-  bool GetGraphDef(TF_Graph* graph, tensorflow::GraphDef* graph_def);
-
-  TF_Operation* Placeholder(TF_Graph* graph, TF_Status* s, const char* name);
-  TF_Operation* Add(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
-                    TF_Status* s, const char* name);
-
-  std::function<bool(const tensorflow::Node*)> MakeCandidateFn(
-      const std::set<string>& node_names);
-
  protected:
-  void PlaceholderHelper(TF_Graph* graph, TF_Status* s, const char* name,
-                         TF_Operation** op);
-  void AddHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
-                 TF_Status* s, const char* name, TF_Operation** op, bool check);
-
-  SegmentOptions default_options_;
-};
-
-bool SegmentTest::GetGraphDef(TF_Graph* graph,
-                              tensorflow::GraphDef* graph_def) {
-  TF_Status* s = TF_NewStatus();
-  TF_Buffer* buffer = TF_NewBuffer();
-  TF_GraphToGraphDef(graph, buffer, s);
-  bool ret = TF_GetCode(s) == TF_OK;
-  EXPECT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  if (ret) ret = graph_def->ParseFromArray(buffer->data, buffer->length);
-  TF_DeleteBuffer(buffer);
-  TF_DeleteStatus(s);
-  return ret;
-}
+  std::function<bool(const tensorflow::Node*)> MakeCandidateFn(
+      const std::set<string>& node_names) {
+    return [node_names](const tensorflow::Node* node) -> bool {
+      return node_names.find(node->name()) != node_names.end();
+    };
+  }
 
-std::function<bool(const tensorflow::Node*)> SegmentTest::MakeCandidateFn(
-    const std::set<string>& node_names) {
-  return [node_names](const tensorflow::Node* node) -> bool {
-    return node_names.find(node->name()) != node_names.end();
-  };
-}
+  std::function<bool(const tensorflow::Edge*)> MakeInputEdgeCandidateFn(
+      const std::set<string>& node_names) {
+    return [node_names](const tensorflow::Edge* in_edge) -> bool {
+      return node_names.find(in_edge->dst()->name()) != node_names.end();
+    };
+  }
 
-void SegmentTest::PlaceholderHelper(TF_Graph* graph, TF_Status* s,
-                                    const char* name, TF_Operation** op) {
-  TF_OperationDescription* desc = TF_NewOperation(graph, "Placeholder", name);
-  TF_SetAttrType(desc, "dtype", TF_INT32);
-  *op = TF_FinishOperation(desc, s);
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  ASSERT_NE(*op, nullptr);
-}
+  std::function<bool(const tensorflow::Edge*)> MakeOutputEdgeCandidateFn(
+      const std::set<string>& node_names) {
+    return [node_names](const tensorflow::Edge* out_edge) -> bool {
+      return node_names.find(out_edge->src()->name()) != node_names.end();
+    };
+  }
 
-TF_Operation* SegmentTest::Placeholder(TF_Graph* graph, TF_Status* s,
-                                       const char* name) {
-  TF_Operation* op;
-  PlaceholderHelper(graph, s, name, &op);
-  return op;
-}
+  void RunTest(const tensorflow::Graph* graph,
+               const std::set<string>& candidates,
+               const std::set<string>& input_candidates,
+               const std::set<string>& output_candidates,
+               const std::vector<std::set<string>>& expected_segments) {
+    SegmentNodesVector segments;
+    TF_EXPECT_OK(SegmentGraph(graph, MakeCandidateFn(candidates),
+                              MakeInputEdgeCandidateFn(input_candidates),
+                              MakeOutputEdgeCandidateFn(output_candidates),
+                              default_options_, &segments));
+    ValidateSegment(segments, expected_segments);
+  }
 
-void SegmentTest::AddHelper(TF_Operation* l, TF_Operation* r, TF_Graph* graph,
-                            TF_Status* s, const char* name, TF_Operation** op,
-                            bool check) {
-  TF_OperationDescription* desc = TF_NewOperation(graph, "AddN", name);
-  TF_Output add_inputs[2] = {{l, 0}, {r, 0}};
-  TF_AddInputList(desc, add_inputs, 2);
-  *op = TF_FinishOperation(desc, s);
-  if (check) {
-    ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-    ASSERT_NE(*op, nullptr);
+  void ValidateSegment(const SegmentNodesVector& segments,
+                       const std::vector<std::set<string>>& expected_segments) {
+    EXPECT_EQ(expected_segments.size(), segments.size());
+    for (int i = 0; i < segments.size(); ++i) {
+      const auto& segment_node_names = segments[i].first;
+      const auto& expected = expected_segments[i];
+      for (const auto& name : expected) {
+        EXPECT_TRUE(segment_node_names.count(name))
+            << "Segment " << i << " is missing expected node: " << name;
+      }
+      if (segment_node_names.size() == expected.size()) continue;
+      for (const auto& name : segment_node_names) {
+        EXPECT_TRUE(expected.count(name))
+            << "Unexpected node found in segment " << i << ": " << name;
+      }
+    }
   }
-}
 
-TF_Operation* SegmentTest::Add(TF_Operation* l, TF_Operation* r,
-                               TF_Graph* graph, TF_Status* s,
-                               const char* name) {
-  TF_Operation* op;
-  AddHelper(l, r, graph, s, name, &op, true);
-  return op;
+  SegmentOptions default_options_;
+};
+
+std::set<string> operator-(const std::set<string>& lhs, const string& rhs) {
+  std::set<string> result = lhs;
+  CHECK(result.erase(rhs));
+  return result;
 }
 
 TEST_F(SegmentTest, Empty) {
-  TF_Graph* graph = TF_NewGraph();
-
-  GraphDef graph_def;
-  ASSERT_TRUE(GetGraphDef(graph, &graph_def));
-
-  SegmentNodesVector segments;
-  ASSERT_EQ(
-      SegmentGraph(graph_def, MakeCandidateFn({}), default_options_, &segments),
-      tensorflow::Status::OK());
-
+  Scope s = Scope::NewRootScope();
+  tensorflow::Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(s.ToGraph(&g));
   // Expect no segments/subgraphs.
-  EXPECT_TRUE(segments.empty());
-  TF_DeleteGraph(graph);
+  RunTest(&g, {}, {}, {}, {});
 }
 
 TEST_F(SegmentTest, Simple) {
-  TF_Status* s = TF_NewStatus();
-  TF_Graph* graph = TF_NewGraph();
-
   //           feed
-  //         //    ||
+  //          //  \\
   //       add0    add1
-  //        | |    /
+  //        | \    /
   //        |  add2
-  //        |  /  ||
+  //        | /   \\
   //       add3    add4
-  //           |  /
+  //          \    /
   //          <sink>
-  //
-  TF_Operation* feed = Placeholder(graph, s, "feed");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
-
-  TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add1 = Add(feed, feed, graph, s, "add1");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add2 = Add(add0, add1, graph, s, "add2");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add3 = Add(add0, add2, graph, s, "add3");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  EXPECT_EQ(string("add3"), string(TF_OperationName(add3)));
-  TF_Operation* add4 = Add(add2, add2, graph, s, "add4");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  EXPECT_EQ(string("add4"), string(TF_OperationName(add4)));
-
-  GraphDef graph_def;
-  ASSERT_TRUE(GetGraphDef(graph, &graph_def));
-
-  SegmentNodesVector segments;
-  ASSERT_EQ(
-      SegmentGraph(graph_def,
-                   MakeCandidateFn({"add0", "add1", "add2", "add3", "add4"}),
-                   default_options_, &segments),
-      tensorflow::Status::OK());
-
-  // Expect all Add operations to be collapsed into a single segment
-  ASSERT_EQ(segments.size(), 1);
-  std::vector<string> expected{"add0", "add1", "add2", "add3", "add4"};
-  for (const auto& ex : expected) {
-    EXPECT_TRUE(segments[0].first.find(ex) != segments[0].first.end())
-        << "Missing expected node " << ex;
-  }
-  TF_DeleteGraph(graph);
-  TF_DeleteStatus(s);
+  Scope s = Scope::NewRootScope();
+  auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT);
+  auto add0 = ops::Add(s.WithOpName("add0"), feed, feed);
+  auto add1 = ops::Add(s.WithOpName("add1"), feed, feed);
+  auto add2 = ops::Add(s.WithOpName("add2"), add0, add1);
+  auto add3 = ops::Add(s.WithOpName("add3"), add0, add2);
+  auto add4 = ops::Add(s.WithOpName("add4"), add2, add2);
+  tensorflow::Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(s.ToGraph(&g));
+
+  // All Add operations are candidates, and we expect all of them to be
+  // collapsed into a single segment
+  const std::set<string> all_adds = {"add0", "add1", "add2", "add3", "add4"};
+  RunTest(&g, all_adds, all_adds, all_adds, {all_adds});
+
+  // Make add1 not a candidate, and we expect all other Add operations to be
+  // collapsed into a single segment
+  auto without_add1 = all_adds - "add1";
+  RunTest(&g, without_add1, without_add1, without_add1, {without_add1});
+
+  // Make add1 not a candidate and add2 not an input candidate, and we expect
+  // add0 and add2 are removed from the segment.
+  auto without_add2 = all_adds - "add2";
+  RunTest(&g, without_add1, without_add2, without_add1, {{"add3", "add4"}});
+
+  // Making add2 not an input candidate itself won't affect anything.
+  RunTest(&g, all_adds, without_add2, all_adds, {all_adds});
+
+  // Making add1 not an input candidate.
+  RunTest(&g, all_adds, without_add1, all_adds, {without_add1});
+
+  // Making add3 not an output candidate doesn't affect anything, since it's
+  // output is sink.
+  auto without_add3 = all_adds - "add3";
+  RunTest(&g, all_adds, all_adds, without_add3, {all_adds});
 }
 
 TEST_F(SegmentTest, AvoidCycle) {
-  TF_Status* s = TF_NewStatus();
-  TF_Graph* graph = TF_NewGraph();
-
-  // add2 is not a TRT candidate so add0/add3 cannot be formed as a
-  // subgraph
-  //
   //           feed
-  //         //    ||
+  //          //  \\
   //       add0    add1
-  //        | |    /
+  //        | \    /
   //        |  add2
-  //        |  /  ||
+  //        |  /  \\
   //       add3    add4
-  //           |  /
+  //          \    /
   //          <sink>
-  //
-  TF_Operation* feed = Placeholder(graph, s, "feed");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
-
-  TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add1 = Add(feed, feed, graph, s, "add1");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add2 = Add(add0, add1, graph, s, "add2");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add3 = Add(add0, add2, graph, s, "add3");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  EXPECT_EQ(string("add3"), string(TF_OperationName(add3)));
-  TF_Operation* add4 = Add(add2, add2, graph, s, "add4");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  EXPECT_EQ(string("add4"), string(TF_OperationName(add4)));
-
-  GraphDef graph_def;
-  ASSERT_TRUE(GetGraphDef(graph, &graph_def));
-
-  SegmentNodesVector segments;
-  ASSERT_EQ(
-      SegmentGraph(graph_def, MakeCandidateFn({"add0", "add1", "add3", "add4"}),
-                   default_options_, &segments),
-      tensorflow::Status::OK());
-
-  // Expect no subgraphs
-  EXPECT_EQ(segments.size(), 0);
-  TF_DeleteGraph(graph);
-  TF_DeleteStatus(s);
+  Scope s = Scope::NewRootScope();
+  auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT);
+  auto add0 = ops::Add(s.WithOpName("add0"), feed, feed);
+  auto add1 = ops::Add(s.WithOpName("add1"), feed, feed);
+  auto add2 = ops::Add(s.WithOpName("add2"), add0, add1);
+  auto add3 = ops::Add(s.WithOpName("add3"), add0, add2);
+  auto add4 = ops::Add(s.WithOpName("add4"), add2, add2);
+  tensorflow::Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(s.ToGraph(&g));
+
+  // add2 is not a TRT candidate so there should be no segments generated.
+  const std::set<string> without_add2 = {"add0", "add1", "add3", "add4"};
+  RunTest(&g, without_add2, without_add2, without_add2, {});
 }
 
 TEST_F(SegmentTest, Multiple) {
-  TF_Status* s = TF_NewStatus();
-  TF_Graph* graph = TF_NewGraph();
-
-  // add5 is not a TRT candidate so two subgraphs should be formed
-  //
-  //                feed
-  //         //      ||     ||
-  //       add0    add1      add7
-  //        | |    /        /   ||
-  //        |  add2-----add5    add8
-  //        |  /  |    |  |    |
-  //       add3   add4     add6
-  //           |     |     /
-  //               <sink>
-  //
-  TF_Operation* feed = Placeholder(graph, s, "feed");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
-
-  TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add1 = Add(feed, feed, graph, s, "add1");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add7 = Add(feed, feed, graph, s, "add7");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add2 = Add(add0, add1, graph, s, "add2");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add5 = Add(add2, add7, graph, s, "add5");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add8 = Add(add7, add7, graph, s, "add8");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add3 = Add(add0, add2, graph, s, "add3");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  EXPECT_EQ(string("add3"), string(TF_OperationName(add3)));
-  TF_Operation* add4 = Add(add2, add5, graph, s, "add4");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  EXPECT_EQ(string("add4"), string(TF_OperationName(add4)));
-  TF_Operation* add6 = Add(add5, add8, graph, s, "add6");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  EXPECT_EQ(string("add6"), string(TF_OperationName(add6)));
-
-  GraphDef graph_def;
-  ASSERT_TRUE(GetGraphDef(graph, &graph_def));
-
-  SegmentNodesVector segments;
-  ASSERT_EQ(SegmentGraph(graph_def,
-                         MakeCandidateFn({"add0", "add1", "add2", "add3",
-                                          "add4", "add6", "add7", "add8"}),
-                         default_options_, &segments),
-            tensorflow::Status::OK());
-
-  // Expect two subgraphs
-  EXPECT_EQ(segments.size(), 2);
-
-  std::vector<string> expected0{"add6", "add8"};
-  for (const auto& ex : expected0) {
-    EXPECT_TRUE(segments[0].first.find(ex) != segments[0].first.end())
-        << "Missing expected node " << ex;
-  }
-
-  std::vector<string> expected1{"add0", "add1", "add2", "add3"};
-  for (const auto& ex : expected1) {
-    EXPECT_TRUE(segments[1].first.find(ex) != segments[1].first.end())
-        << "Missing expected node " << ex;
-  }
-  TF_DeleteGraph(graph);
-  TF_DeleteStatus(s);
+  //              feed
+  //           //  ||  \\
+  //        add0  add1  add7
+  //        |  \  /     / \\
+  //        |  add2    /   \\
+  //        |   || \   |   ||
+  //        |   ||  add5  add8
+  //        |  /  \ /  \   /
+  //        add3  add4  add6
+  //           \   |   /
+  //             <sink>
+  Scope s = Scope::NewRootScope();
+  auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT);
+  auto add0 = ops::Add(s.WithOpName("add0"), feed, feed);
+  auto add1 = ops::Add(s.WithOpName("add1"), feed, feed);
+  auto add7 = ops::Add(s.WithOpName("add7"), feed, feed);
+  auto add2 = ops::Add(s.WithOpName("add2"), add0, add1);
+  auto add5 = ops::Add(s.WithOpName("add5"), add2, add7);
+  auto add8 = ops::Add(s.WithOpName("add8"), add7, add7);
+  auto add3 = ops::Add(s.WithOpName("add3"), add0, add2);
+  auto add4 = ops::Add(s.WithOpName("add4"), add2, add5);
+  auto add6 = ops::Add(s.WithOpName("add6"), add5, add8);
+  tensorflow::Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(s.ToGraph(&g));
+
+  const std::set<string> all_adds = {"add0", "add1", "add2", "add3", "add4",
+                                     "add5", "add6", "add7", "add8"};
+  // Make add5 not a TRT candidate, and we expect two segments.
+  auto without_add5 = all_adds - "add5";
+  RunTest(&g, without_add5, without_add5, without_add5,
+          {{"add6", "add8"}, {"add0", "add1", "add2", "add3"}});
+
+  // Make add8 not a candidate and add6 not an input candidate, then all direct
+  // and indirect inputs of add6 will be removed from the segment.
+  auto without_add8 = all_adds - "add8";
+  auto without_add6 = all_adds - "add6";
+  RunTest(&g, without_add8, without_add6, all_adds, {{"add3", "add4"}});
+
+  // Make add3 not a candidate and add0 not an output candidate, then all
+  // direct and indirect outputs of add0 will be removed from the segment.
+  auto without_add3 = all_adds - "add3";
+  auto without_add0 = all_adds - "add0";
+  RunTest(&g, without_add3, all_adds, without_add0, {{"add1", "add7", "add8"}});
 }
 
 TEST_F(SegmentTest, BigIfElse) {
-  TF_Status* s = TF_NewStatus();
-  TF_Graph* graph = TF_NewGraph();
-
-  // add2 is not a TRT candidate
-  //
   //           feed
   //            ||
   //           add0
-  //         //    ||
+  //         //    \\
   //       add1    add4
   //        ||      ||
   //       add2    add5
   //        ||      ||
   //       add3    add6
-  //         ||    //
+  //         \\    //
   //           add7
   //            ||
   //          <sink>
-  //
-  TF_Operation* feed = Placeholder(graph, s, "feed");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  EXPECT_EQ(string("feed"), string(TF_OperationName(feed)));
-
-  TF_Operation* add0 = Add(feed, feed, graph, s, "add0");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add1 = Add(add0, add0, graph, s, "add1");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add2 = Add(add1, add1, graph, s, "add2");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add3 = Add(add2, add2, graph, s, "add3");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add4 = Add(add0, add0, graph, s, "add4");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add5 = Add(add4, add4, graph, s, "add5");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add6 = Add(add5, add5, graph, s, "add6");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  TF_Operation* add7 = Add(add3, add6, graph, s, "add7");
-  ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s);
-  EXPECT_EQ(string("add7"), string(TF_OperationName(add7)));
-
-  GraphDef graph_def;
-  ASSERT_TRUE(GetGraphDef(graph, &graph_def));
-
-  SegmentNodesVector segments;
-  ASSERT_EQ(SegmentGraph(graph_def,
-                         MakeCandidateFn({"add0", "add1", "add3", "add4",
-                                          "add5", "add6", "add7"}),
-                         default_options_, &segments),
-            tensorflow::Status::OK());
-
-  // Expect 2 subgraphs
-  EXPECT_EQ(segments.size(), 2);
-
-  std::vector<string> expected0{"add3", "add4", "add5", "add6", "add7"};
-  for (const auto& ex : expected0) {
-    EXPECT_TRUE(segments[0].first.find(ex) != segments[0].first.end())
-        << "Missing expected node " << ex;
-  }
-
-  std::vector<string> expected1{"add0", "add1"};
-  for (const auto& ex : expected1) {
-    EXPECT_TRUE(segments[1].first.find(ex) != segments[1].first.end())
-        << "Missing expected node " << ex;
-  }
-  TF_DeleteGraph(graph);
-  TF_DeleteStatus(s);
+  Scope s = Scope::NewRootScope();
+  auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT);
+  auto add0 = ops::Add(s.WithOpName("add0"), feed, feed);
+  auto add1 = ops::Add(s.WithOpName("add1"), add0, add0);
+  auto add2 = ops::Add(s.WithOpName("add2"), add1, add1);
+  auto add3 = ops::Add(s.WithOpName("add3"), add2, add2);
+  auto add4 = ops::Add(s.WithOpName("add4"), add0, add0);
+  auto add5 = ops::Add(s.WithOpName("add5"), add4, add4);
+  auto add6 = ops::Add(s.WithOpName("add6"), add5, add5);
+  auto add7 = ops::Add(s.WithOpName("add7"), add3, add6);
+  tensorflow::Graph g(OpRegistry::Global());
+  TF_EXPECT_OK(s.ToGraph(&g));
+
+  // Make add2 not a TRT candidate, and we expect 2 segments.
+  const std::set<string> all_adds = {"add0", "add1", "add2", "add3",
+                                     "add4", "add5", "add6", "add7"};
+  RunTest(&g, all_adds - "add2", all_adds, all_adds,
+          {{"add3", "add4", "add5", "add6", "add7"}, {"add0", "add1"}});
 }
 
 }  // namespace test
diff --git a/tensorflow/contrib/tensorrt/tensorrt_test.cc b/tensorflow/contrib/tensorrt/tensorrt_test.cc
index 3712a9a6fe349d949ef2666652b9d750538d5535..769982c6456f76663e50fe3ec59651127e3720ac 100644
--- a/tensorflow/contrib/tensorrt/tensorrt_test.cc
+++ b/tensorflow/contrib/tensorrt/tensorrt_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/test.h"
 
 #if GOOGLE_CUDA
@@ -130,6 +132,13 @@ void Execute(nvinfer1::IExecutionContext* context, const float* input,
 }
 
 TEST(TensorrtTest, BasicFunctions) {
+  // Handle the case where the test is run on machine with no gpu available.
+  if (CHECK_NOTNULL(GPUMachineManager())->VisibleDeviceCount() <= 0) {
+    LOG(WARNING) << "No gpu device available, probably not being run on a gpu "
+                    "machine. Skipping...";
+    return;
+  }
+
   // Create the network model.
   nvinfer1::IHostMemory* model = CreateNetwork();
   // Use the model to create an engine and then an execution context.
diff --git a/tensorflow/contrib/tensorrt/test/base_test.py b/tensorflow/contrib/tensorrt/test/base_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..edd30ad7a95dd3c7f74634699660caad30c0b645
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/base_test.py
@@ -0,0 +1,126 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Basic tests for TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.platform import test
+
+
+class SimpleSingleEngineGraphDefTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Create a graph containing single segment."""
+    # TODO(aaroey): test graph with different dtypes.
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [100, 24, 24, 2]
+    g = ops.Graph()
+    with g.as_default():
+      inp = array_ops.placeholder(
+          dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+      with g.device("/GPU:0"):
+        conv_filter = constant_op.constant(
+            [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
+            name="weights",
+            dtype=dtype)
+        conv = nn.conv2d(
+            input=inp,
+            filter=conv_filter,
+            strides=[1, 2, 2, 1],
+            padding="SAME",
+            name="conv")
+        bias = constant_op.constant(
+            [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtype)
+        added = nn.bias_add(conv, bias, name="bias_add")
+        relu = nn.relu(added, "relu")
+        identity = array_ops.identity(relu, "identity")
+        pool = nn_ops.max_pool(
+            identity, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
+      array_ops.squeeze(pool, name=self.output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[input_dims],
+        num_expected_engines=1,
+        expected_output_dims=(100, 6, 6, 6),
+        allclose_atol=1.e-03,
+        allclose_rtol=1.e-03)
+
+
+class SimpleMultiEngineGraphDefTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Create a graph containing multiple segment."""
+    # TODO(aaroey): test graph with different dtypes.
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [100, 24, 24, 2]
+    g = ops.Graph()
+    with g.as_default():
+      inp = array_ops.placeholder(
+          dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+      with g.device("/GPU:0"):
+        conv_filter = constant_op.constant(
+            [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
+            name="weights",
+            dtype=dtype)
+        conv = nn.conv2d(
+            input=inp,
+            filter=conv_filter,
+            strides=[1, 2, 2, 1],
+            padding="SAME",
+            name="conv")
+        c1 = constant_op.constant(
+            np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype)
+        p = conv * c1
+        c2 = constant_op.constant(
+            np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype)
+        q = conv / c2
+
+        edge = self.trt_incompatible_op(q)
+        edge /= edge
+        r = edge + edge
+
+        p -= edge
+        q *= edge
+        s = p + q
+        s -= r
+      array_ops.squeeze(s, name=self.output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[input_dims],
+        num_expected_engines=2,
+        expected_output_dims=(100, 12, 12, 6),
+        allclose_atol=1.e-03,
+        allclose_rtol=1.e-03)
+
+
+# TODO(aaroey): add a large complex graph to test.
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py b/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..730b6843fb9885b8ba0db2ad199b95d9d3219774
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
@@ -0,0 +1,76 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class BatchMatMulTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing conversion of BatchMatMul in TF-TRT conversion."""
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [12, 5, 8, 12]
+    w1_name = "matmul_w1"
+    w1_dims = [12, 5, 12, 7]
+    w2_name = "matmul_w2"
+    w2_dims = [12, 12, 7]
+    g = ops.Graph()
+    with g.as_default():
+      inp = array_ops.placeholder(
+          dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+      w1 = array_ops.placeholder(dtype=dtype, shape=w1_dims, name=w1_name)
+      w2 = array_ops.placeholder(dtype=dtype, shape=w2_dims, name=w2_name)
+      with g.device("/GPU:0"):
+        b = constant_op.constant(np.random.randn(12, 5, 12, 7), dtype=dtype)
+        c = constant_op.constant(np.random.randn(5, 1, 1), dtype=dtype)
+        d = constant_op.constant(np.random.randn(5, 1, 1), dtype=dtype)
+        x1 = math_ops.matmul(inp, b)
+        x1 = x1 + c
+        x2 = math_ops.matmul(inp, w1)
+        x2 = x2 * d
+        e = gen_array_ops.reshape(inp, [12, 40, 12])
+        x3 = math_ops.matmul(e, w2)
+        f = constant_op.constant(np.random.randn(40, 1), dtype=dtype)
+        x3 = x3 + f
+        x3 = gen_array_ops.reshape(x3, [12, 5, 8, 7])
+        out = x1 + x2 + x3
+      array_ops.squeeze(out, name=self.output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name, w1_name, w2_name],
+        input_dims=[input_dims, w1_dims, w2_dims],
+        num_expected_engines=1,
+        expected_output_dims=(12, 5, 8, 7),
+        allclose_atol=1.e-03,
+        allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c03a10b640c8b243318bb4327d2ac5aac803be7
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
@@ -0,0 +1,112 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing conversion of BiasAdd MatMul in TF-TRT conversion."""
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [48, 12]
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+
+      b = constant_op.constant(np.random.randn(12, 4), dtype=dtype)
+      x1 = math_ops.matmul(x, b)
+      b = constant_op.constant(np.random.randn(1, 4), dtype=dtype)
+      x1 = x1 + b
+
+      b = constant_op.constant(np.random.randn(48, 4), dtype=dtype)
+      x2 = math_ops.matmul(x, b, transpose_a=True)
+      x2 = gen_array_ops.reshape(x2, [48, 1])
+
+      b = constant_op.constant(np.random.randn(4, 12), dtype=dtype)
+      x3 = math_ops.matmul(x, b, transpose_b=True)
+
+      b = constant_op.constant(np.random.randn(16, 48), dtype=dtype)
+      x4 = math_ops.matmul(x, b, transpose_b=True, transpose_a=True)
+      x4 = gen_array_ops.reshape(x4, [48, 4])
+
+      x5 = gen_array_ops.reshape(x, [4, 144])
+      b = constant_op.constant(np.random.randn(144, 48), dtype=dtype)
+      x5 = math_ops.matmul(x5, b)
+      b = constant_op.constant(np.random.randn(48), dtype=dtype)
+      x5 = nn.bias_add(x5, b)
+      x5 = gen_array_ops.reshape(x5, [48, 4])
+
+      x6 = gen_array_ops.reshape(x, [4, 12, 12])
+      b = constant_op.constant(np.random.randn(12), dtype=dtype)
+      x6 = nn.bias_add(x6, b, data_format="NHWC")
+      x6 = gen_array_ops.reshape(x6, [48, -1])
+
+      x7 = gen_array_ops.reshape(x, [4, 12, 3, 4])
+      b = constant_op.constant(np.random.randn(4), dtype=dtype)
+      x7 = nn.bias_add(x7, b, data_format="NHWC")
+      x7 = gen_array_ops.reshape(x7, [48, -1])
+
+      x8 = gen_array_ops.reshape(x, [4, 12, 3, 2, 2])
+      b = constant_op.constant(np.random.randn(2), dtype=dtype)
+      x8 = nn.bias_add(x8, b, data_format="NHWC")
+      x8 = gen_array_ops.reshape(x8, [48, -1])
+
+      x9 = gen_array_ops.reshape(x, [4, 12, 3, 2, 2])
+      b = constant_op.constant(np.random.randn(3), dtype=dtype)
+      x9 = nn.bias_add(x9, b, data_format="NCHW")
+      x9 = gen_array_ops.reshape(x9, [48, -1])
+
+      x10 = gen_array_ops.reshape(x, [4, 12, 3, 4])
+      b = constant_op.constant(np.random.randn(12), dtype=dtype)
+      x10 = nn.bias_add(x10, b, data_format="NCHW")
+      x10 = gen_array_ops.reshape(x10, [48, -1])
+
+      x11 = gen_array_ops.reshape(x, [4, 12, 12])
+      b = constant_op.constant(np.random.randn(4), dtype=dtype)
+      x11 = nn.bias_add(x11, b, data_format="NCHW")
+      x11 = gen_array_ops.reshape(x11, [48, -1])
+
+      out = array_ops.concat(
+          [x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11], axis=-1)
+      out = array_ops.squeeze(out, name=self.output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[input_dims],
+        num_expected_engines=7,
+        expected_output_dims=(48, 89),
+        allclose_atol=1.e-03,
+        allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py b/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd673463a5930df4d0e4c1c7410b3f5eb88d664c
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
@@ -0,0 +1,119 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class BinaryTensorWeightBroadcastTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Tests for scale & elementwise layers in TF-TRT."""
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [10, 24, 24, 20]
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+      # scale
+      a = constant_op.constant(np.random.randn(1), dtype=dtype)
+      f = x + a
+      x = math_ops.sigmoid(f)
+      # scale
+      a = constant_op.constant(np.random.randn(1), dtype=dtype)
+      f = a + x
+      x = math_ops.sigmoid(f)
+      # scale
+      a = constant_op.constant(np.random.randn(24, 1, 1), dtype=dtype)
+      f = x + a
+      x = math_ops.sigmoid(f)
+      # scale
+      a = constant_op.constant(np.random.randn(24, 1, 1), dtype=dtype)
+      f = a + x
+      x = math_ops.sigmoid(f)
+      # scale
+      a = constant_op.constant(np.random.randn(24, 24, 20), dtype=dtype)
+      f = a + x
+      x = math_ops.sigmoid(f)
+      # scale
+      a = constant_op.constant(np.random.randn(24, 24, 20), dtype=dtype)
+      f = x + a
+      x = math_ops.sigmoid(f)
+      # elementwise
+      a = constant_op.constant(np.random.randn(20), dtype=dtype)
+      f = x + a
+      x = math_ops.sigmoid(f)
+      # elementwise
+      a = constant_op.constant(np.random.randn(20), dtype=dtype)
+      f = a + x
+      x = math_ops.sigmoid(f)
+      # elementwise
+      a = constant_op.constant(np.random.randn(1, 24, 1, 1), dtype=dtype)
+      f = a + x
+      x = math_ops.sigmoid(f)
+      # elementwise
+      a = constant_op.constant(np.random.randn(1, 24, 1, 1), dtype=dtype)
+      f = x + a
+      x = math_ops.sigmoid(f)
+      # elementwise
+      a = constant_op.constant(np.random.randn(1, 24, 24, 1), dtype=dtype)
+      f = a + x
+      x = math_ops.sigmoid(f)
+      # elementwise
+      a = constant_op.constant(np.random.randn(1, 24, 24, 1), dtype=dtype)
+      f = x + a
+      x = math_ops.sigmoid(f)
+      # elementwise
+      a = constant_op.constant(np.random.randn(1, 24, 24, 20), dtype=dtype)
+      f = a + x
+      x = math_ops.sigmoid(f)
+      # elementwise
+      a = constant_op.constant(np.random.randn(1, 24, 24, 20), dtype=dtype)
+      f = x + a
+      x = math_ops.sigmoid(f)
+      # elementwise
+      a = constant_op.constant(np.random.randn(24, 20), dtype=dtype)
+      f = a + x
+      x = math_ops.sigmoid(f)
+      # elementwise
+      a = constant_op.constant(np.random.randn(24, 20), dtype=dtype)
+      f = x + a
+      x = math_ops.sigmoid(f)
+      gen_array_ops.reshape(x, [5, -1], name=self.output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[input_dims],
+        num_expected_engines=16,
+        expected_output_dims=(5, 23040),
+        allclose_atol=1.e-03,
+        allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/concatenation_test.py b/tensorflow/contrib/tensorrt/test/concatenation_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c51c45b0a2c6f370415b9c8ac99a63dd37be900
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/concatenation_test.py
@@ -0,0 +1,83 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.platform import test
+
+
+class ConcatenationTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing Concatenation in TF-TRT conversion."""
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [2, 3, 3, 1]
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+      # scale
+      a = constant_op.constant(np.random.randn(3, 1, 1), dtype=dtype)
+      r1 = x / a
+      a = constant_op.constant(np.random.randn(3, 1, 1), dtype=dtype)
+      r2 = a / x
+      a = constant_op.constant(np.random.randn(1, 3, 1), dtype=dtype)
+      r3 = a + x
+      a = constant_op.constant(np.random.randn(1, 3, 1), dtype=dtype)
+      r4 = x * a
+      a = constant_op.constant(np.random.randn(3, 1, 1), dtype=dtype)
+      r5 = x - a
+      a = constant_op.constant(np.random.randn(3, 1, 1), dtype=dtype)
+      r6 = a - x
+      a = constant_op.constant(np.random.randn(3, 1), dtype=dtype)
+      r7 = x - a
+      a = constant_op.constant(np.random.randn(3, 1), dtype=dtype)
+      r8 = a - x
+      a = constant_op.constant(np.random.randn(3, 1, 1), dtype=dtype)
+      r9 = gen_math_ops.maximum(x, a)
+      a = constant_op.constant(np.random.randn(3, 1), dtype=dtype)
+      r10 = gen_math_ops.minimum(a, x)
+      a = constant_op.constant(np.random.randn(3), dtype=dtype)
+      r11 = x * a
+      a = constant_op.constant(np.random.randn(1), dtype=dtype)
+      r12 = a * x
+      concat1 = array_ops.concat([r1, r2, r3, r4, r5, r6], axis=-1)
+      concat2 = array_ops.concat([r7, r8, r9, r10, r11, r12], axis=3)
+      x = array_ops.concat([concat1, concat2], axis=-1)
+      gen_array_ops.reshape(x, [2, -1], name=self.output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[input_dims],
+        num_expected_engines=1,
+        expected_output_dims=(2, 126),
+        allclose_atol=1.e-03,
+        allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py b/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..97b29bf05ddc3a0396472d0500ff53ceca7c5d4b
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
@@ -0,0 +1,68 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class ConstBroadcastTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Test for Constant broadcasting in TF-TRT."""
+    dtype = dtypes.float32
+    input_name = 'input'
+    input_dims = [5, 12, 12, 2]
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+      filt1 = constant_op.constant(
+          0.3, shape=(3, 3, 2, 1), dtype=dtype, name='filt1')
+      y1 = nn.conv2d(x, filt1, strides=[1, 1, 1, 1], padding='SAME', name='y1')
+      z1 = nn.relu(y1, name='z1')
+      filt2 = constant_op.constant(
+          np.random.randn(9), shape=(3, 3, 1, 1), dtype=dtype, name='filt2')
+      y2 = nn.conv2d(z1, filt2, strides=[1, 1, 1, 1], padding='SAME', name='y2')
+      z2 = nn.relu(y2, name='z')
+      filt3 = constant_op.constant(
+          np.random.randn(3, 3, 1, 1),
+          shape=(3, 3, 1, 1),
+          dtype=dtype,
+          name='filt3')
+      y3 = nn.conv2d(z2, filt3, strides=[1, 1, 1, 1], padding='SAME', name='y3')
+      nn.relu(y3, name='output')
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[input_dims],
+        num_expected_engines=1,
+        expected_output_dims=(5, 12, 12, 1),
+        allclose_atol=1.e-02,
+        allclose_rtol=1.e-02)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py b/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dd95c6f62fe504cb23e01fdb8b9785cee080de4
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
@@ -0,0 +1,72 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class MemoryAlignmentTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Testing conversion of BatchMatMul in TF-TRT conversion."""
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [2, 15, 15, 3]
+    g = ops.Graph()
+    with g.as_default():
+      inp = array_ops.placeholder(
+          dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+      with g.device("/GPU:0"):
+        e1 = constant_op.constant(
+            np.random.randn(1, 1, 3, 5), name="kernel_1", dtype=dtype)
+        e2 = constant_op.constant(
+            np.random.randn(1, 1, 5, 10), name="kernel_2", dtype=dtype)
+        conv = nn.conv2d(
+            input=inp,
+            filter=e1,
+            strides=[1, 1, 1, 1],
+            padding="VALID",
+            name="conv")
+        out = nn.conv2d(
+            input=conv,
+            filter=e2,
+            strides=[1, 1, 1, 1],
+            padding="VALID",
+            name="conv_2")
+      array_ops.squeeze(out, name=self.output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[input_dims],
+        num_expected_engines=1,
+        expected_output_dims=(2, 15, 15, 10),
+        allclose_atol=1.e-02,
+        allclose_rtol=1.e-02)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py b/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..734ccf6345777d543138daba2b720c9dc03f3295
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
@@ -0,0 +1,87 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class MultiConnectionNeighborEngineTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Test for multi connection neighboring nodes wiring tests in TF-TRT."""
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [2, 3, 7, 5]
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+      e = constant_op.constant(
+          np.random.normal(.05, .005, [3, 2, 3, 4]),
+          name="weights",
+          dtype=dtype)
+      conv = nn.conv2d(
+          input=x,
+          filter=e,
+          data_format="NCHW",
+          strides=[1, 1, 1, 1],
+          padding="VALID",
+          name="conv")
+      b = constant_op.constant(
+          np.random.normal(2.0, 1.0, [1, 4, 1, 1]), name="bias", dtype=dtype)
+      t = conv + b
+
+      b = constant_op.constant(
+          np.random.normal(5.0, 1.0, [1, 4, 1, 1]), name="bias", dtype=dtype)
+      q = conv - b
+      edge = math_ops.sigmoid(q)
+
+      b = constant_op.constant(
+          np.random.normal(5.0, 1.0, [1, 4, 1, 1]), name="bias", dtype=dtype)
+      d = b + conv
+      edge3 = math_ops.sigmoid(d)
+
+      edge1 = gen_math_ops.tan(conv)
+      t = t - edge1
+      q = q + edge
+      t = t + q
+      t = t + d
+      t = t - edge3
+      array_ops.squeeze(t, name=self.output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[input_dims],
+        num_expected_engines=2,
+        expected_output_dims=(2, 4, 5, 4),
+        allclose_atol=1.e-03,
+        allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py b/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..50265c0845005748d75bf8afc49df11a528c9169
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
@@ -0,0 +1,69 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.platform import test
+
+
+class NeighboringEngineTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Neighboring node wiring tests in TF-TRT conversion."""
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [2, 3, 7, 5]
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+      e = constant_op.constant(
+          np.random.normal(.3, 0.05, [3, 2, 3, 4]), name="weights", dtype=dtype)
+      conv = nn.conv2d(
+          input=x,
+          filter=e,
+          data_format="NCHW",
+          strides=[1, 1, 1, 1],
+          padding="VALID",
+          name="conv")
+      b = constant_op.constant(
+          np.random.normal(1.0, 1.0, [1, 4, 1, 1]), name="bias", dtype=dtype)
+      t = conv * b
+      e = gen_math_ops.tan(conv)
+      t = t - e
+      array_ops.squeeze(t, name=self.output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[input_dims],
+        num_expected_engines=2,
+        expected_output_dims=(2, 4, 5, 4),
+        allclose_atol=1.e-03,
+        allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
deleted file mode 100644
index 035b112254e8cb6714443790f65bb611375b64a7..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test.py
+++ /dev/null
@@ -1,353 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Script to test TF-TensorRT integration."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from collections import namedtuple
-import itertools
-import warnings
-import numpy as np
-import six
-
-from tensorflow.contrib import tensorrt as trt
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import importer
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn
-from tensorflow.python.ops import nn_ops
-from tensorflow.python.platform import test
-
-INPUT_NAME = "input"
-OUTPUT_NAME = "output"
-INPUT_DIMS = [100, 24, 24, 2]
-MODE_FP32 = "FP32"
-MODE_FP16 = "FP16"
-MODE_INT8 = "INT8"
-
-if six.PY2:
-  to_bytes = lambda s: s
-  to_string = lambda s: s
-else:
-  to_bytes = lambda s: s.encode("utf-8", errors="surrogateescape")
-  to_string = lambda s: s.decode("utf-8")
-
-
-# TODO(aaroey): test graph with different dtypes.
-def GetSingleEngineGraphDef(dtype=dtypes.float32):
-  """Create a graph containing single segment."""
-  g = ops.Graph()
-  with g.as_default():
-    inp = array_ops.placeholder(
-        dtype=dtype, shape=[None] + INPUT_DIMS[1:], name=INPUT_NAME)
-    with g.device("/GPU:0"):
-      conv_filter = constant_op.constant(
-          [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
-          name="weights",
-          dtype=dtype)
-      conv = nn.conv2d(
-          input=inp,
-          filter=conv_filter,
-          strides=[1, 2, 2, 1],
-          padding="SAME",
-          name="conv")
-      bias = constant_op.constant(
-          [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtype)
-      added = nn.bias_add(conv, bias, name="bias_add")
-      relu = nn.relu(added, "relu")
-      identity = array_ops.identity(relu, "identity")
-      pool = nn_ops.max_pool(
-          identity, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
-    array_ops.squeeze(pool, name=OUTPUT_NAME)
-  return g.as_graph_def()
-
-
-# TODO(aaroey): test graph with different dtypes.
-def GetMultiEngineGraphDef(dtype=dtypes.float32):
-  """Create a graph containing multiple segment."""
-  g = ops.Graph()
-  with g.as_default():
-    inp = array_ops.placeholder(
-        dtype=dtype, shape=[None] + INPUT_DIMS[1:], name=INPUT_NAME)
-    with g.device("/GPU:0"):
-      conv_filter = constant_op.constant(
-          [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
-          name="weights",
-          dtype=dtype)
-      conv = nn.conv2d(
-          input=inp,
-          filter=conv_filter,
-          strides=[1, 2, 2, 1],
-          padding="SAME",
-          name="conv")
-      c1 = constant_op.constant(
-          np.random.randn(INPUT_DIMS[0], 12, 12, 6), dtype=dtype)
-      p = conv * c1
-      c2 = constant_op.constant(
-          np.random.randn(INPUT_DIMS[0], 12, 12, 6), dtype=dtype)
-      q = conv / c2
-
-      edge = math_ops.sin(q)
-      edge /= edge
-      r = edge + edge
-
-      p -= edge
-      q *= edge
-      s = p + q
-      s -= r
-    array_ops.squeeze(s, name=OUTPUT_NAME)
-  return g.as_graph_def()
-
-
-TestGraph = namedtuple("TestGraph",
-                       ["gdef", "num_expected_engines", "expected_output_dims"])
-
-TEST_GRAPHS = {
-    "SingleEngineGraph":
-        TestGraph(
-            gdef=GetSingleEngineGraphDef(),
-            num_expected_engines=1,
-            expected_output_dims=(100, 6, 6, 6)),
-    "MultiEngineGraph":
-        TestGraph(
-            gdef=GetMultiEngineGraphDef(),
-            num_expected_engines=2,
-            expected_output_dims=(100, 12, 12, 6)),
-    # TODO(aaroey): add a large complex graph to test.
-}
-
-
-class TfTrtIntegrationTest(test_util.TensorFlowTestCase):
-  """Class to test Tensorflow-TensorRT integration."""
-
-  def setUp(self):
-    """Setup method."""
-    super(TfTrtIntegrationTest, self).setUp()
-    warnings.simplefilter("always")
-    self._input = np.random.random_sample(INPUT_DIMS)
-
-  def _GetConfigProto(self,
-                      use_optimizer,
-                      precision_mode=None,
-                      is_dynamic_op=None):
-    if use_optimizer:
-      rewriter_cfg = rewriter_config_pb2.RewriterConfig()
-      rewriter_cfg.optimizers.extend(["constfold", "layout"])
-      custom_op = rewriter_cfg.custom_optimizers.add()
-      custom_op.name = "TensorRTOptimizer"
-      custom_op.parameter_map["minimum_segment_size"].i = 3
-      custom_op.parameter_map["max_batch_size"].i = self._input.shape[0]
-      custom_op.parameter_map["is_dynamic_op"].b = is_dynamic_op
-      custom_op.parameter_map["max_workspace_size_bytes"].i = 1 << 25
-      custom_op.parameter_map["precision_mode"].s = to_bytes(precision_mode)
-      graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg)
-    else:
-      graph_options = config_pb2.GraphOptions()
-
-    gpu_options = config_pb2.GPUOptions()
-    if trt.trt_convert.get_linked_tensorrt_version()[0] == 3:
-      gpu_options.per_process_gpu_memory_fraction = 0.50
-
-    config = config_pb2.ConfigProto(
-        gpu_options=gpu_options, graph_options=graph_options)
-    return config
-
-  def _RunGraph(self, graph_key, gdef, input_data, config, num_runs=2):
-    """Run given graphdef multiple times."""
-    g = ops.Graph()
-    with g.as_default():
-      inp, out = importer.import_graph_def(
-          graph_def=gdef, return_elements=[INPUT_NAME, OUTPUT_NAME], name="")
-      inp = inp.outputs[0]
-      out = out.outputs[0]
-    with self.test_session(
-        graph=g, config=config, use_gpu=True, force_gpu=True) as sess:
-      val = None
-      # Defaults to 2 runs to verify result across multiple runs is same.
-      for _ in range(num_runs):
-        new_val = sess.run(out, {inp: input_data})
-        self.assertEqual(TEST_GRAPHS[graph_key].expected_output_dims,
-                         new_val.shape)
-        if val is not None:
-          self.assertAllEqual(new_val, val)
-        val = new_val
-    return val
-
-  # Use real data that is representative of the inference dataset
-  # for calibration. For this test script it is random data.
-  def _RunCalibration(self, graph_key, gdef, input_data, config):
-    """Run calibration on given graph."""
-    return self._RunGraph(graph_key, gdef, input_data, config, 30)
-
-  def _GetTrtGraph(self, gdef, precision_mode, is_dynamic_op):
-    """Return trt converted graph."""
-    return trt.create_inference_graph(
-        input_graph_def=gdef,
-        outputs=[OUTPUT_NAME],
-        max_batch_size=self._input.shape[0],
-        max_workspace_size_bytes=1 << 25,
-        precision_mode=precision_mode,
-        minimum_segment_size=2,
-        is_dynamic_op=is_dynamic_op)
-
-  def _VerifyGraphDef(self,
-                      graph_key,
-                      gdef,
-                      precision_mode=None,
-                      is_calibrated=None,
-                      dynamic_engine=None):
-    num_engines = 0
-    for n in gdef.node:
-      if n.op == "TRTEngineOp":
-        num_engines += 1
-        self.assertNotEqual(to_bytes(""), n.attr["serialized_segment"].s)
-        self.assertNotEqual(to_bytes(""), n.attr["segment_funcdef_name"].s)
-        self.assertEqual(n.attr["precision_mode"].s, to_bytes(precision_mode))
-        self.assertEqual(n.attr["static_engine"].b, not dynamic_engine)
-        if precision_mode == MODE_INT8 and is_calibrated:
-          self.assertNotEqual(to_bytes(""), n.attr["calibration_data"].s)
-        else:
-          self.assertEqual(to_bytes(""), n.attr["calibration_data"].s)
-    if precision_mode is None:
-      self.assertEqual(num_engines, 0)
-    else:
-      self.assertEqual(num_engines,
-                       TEST_GRAPHS[graph_key].num_expected_engines)
-
-  def _RunTest(self, graph_key, use_optimizer, precision_mode,
-               dynamic_infer_engine, dynamic_calib_engine):
-    assert precision_mode in [MODE_FP32, MODE_FP16, MODE_INT8]
-    input_gdef = TEST_GRAPHS[graph_key].gdef
-    self._VerifyGraphDef(graph_key, input_gdef)
-
-    # Get reference result without running trt.
-    config_no_trt = self._GetConfigProto(False)
-    print("Running original graph w/o trt, config:\n%s" % str(config_no_trt))
-    ref_result = self._RunGraph(graph_key, input_gdef, self._input,
-                                config_no_trt)
-
-    # Run calibration if necessary.
-    if precision_mode == MODE_INT8:
-
-      calib_config = self._GetConfigProto(use_optimizer, precision_mode,
-                                          dynamic_calib_engine)
-      print("Running calibration graph, config:\n%s" % str(calib_config))
-      if use_optimizer:
-        self.assertTrue(False)
-        # TODO(aaroey): uncomment this and get infer_gdef when this mode is
-        # supported.
-        # result = self._RunCalibration(graph_key, input_gdef, self._input,
-        #                               calib_config)
-      else:
-        calib_gdef = self._GetTrtGraph(input_gdef, precision_mode,
-                                       dynamic_calib_engine)
-        self._VerifyGraphDef(graph_key, calib_gdef, precision_mode, False,
-                             dynamic_calib_engine)
-        result = self._RunCalibration(graph_key, calib_gdef, self._input,
-                                      calib_config)
-        infer_gdef = trt.calib_graph_to_infer_graph(calib_gdef)
-        self._VerifyGraphDef(graph_key, infer_gdef, precision_mode, True,
-                             dynamic_calib_engine)
-      self.assertAllClose(ref_result, result, rtol=1.e-03)
-    else:
-      infer_gdef = input_gdef
-
-    # Run inference.
-    infer_config = self._GetConfigProto(use_optimizer, precision_mode,
-                                        dynamic_infer_engine)
-    print("Running final inference graph, config:\n%s" % str(infer_config))
-    if use_optimizer:
-      result = self._RunGraph(graph_key, infer_gdef, self._input, infer_config)
-    else:
-      trt_infer_gdef = self._GetTrtGraph(infer_gdef, precision_mode,
-                                         dynamic_infer_engine)
-      self._VerifyGraphDef(graph_key, trt_infer_gdef, precision_mode, True,
-                           dynamic_infer_engine)
-      result = self._RunGraph(graph_key, trt_infer_gdef, self._input,
-                              infer_config)
-    self.assertAllClose(ref_result, result, rtol=1.e-03)
-
-  def testIdempotence(self):
-    # Test that applying tensorrt optimizer or offline conversion tools multiple
-    # times to the same graph will result in same graph.
-    #
-    # TODO(aaroey): currently the conversion is not deterministic, this is
-    # mainly because during tensorflow::ConvertGraphDefToGraph(), the graph uses
-    # EdgeSet which use a map keyed by Edge*, so the order of input/output edges
-    # of a node is nondeterministic, thus the order for segmenter to contract
-    # edges is nondeterministic. Need to evaluate whether we should fix this.
-    pass
-
-
-def GetTests():
-
-  def _GetTest(g, u, p, i, c):
-
-    def _Test(self):
-      print("Running test with parameters: graph_key=%s, use_optimizer=%s, "
-            "precision_mode=%s, dynamic_infer_engine=%s, "
-            "dynamic_calib_engine=%s" % (g, u, p, i, c))
-      self._RunTest(g, u, p, i, c)
-
-    return _Test
-
-  use_optimizer_options = [False, True]
-  precision_mode_options = [MODE_FP32, MODE_FP16, MODE_INT8]
-  dynamic_infer_engine_options = [False, True]
-  dynamic_calib_engine_options = [False, True]
-  for (graph_key, use_optimizer, precision_mode,
-       dynamic_infer_engine, dynamic_calib_engine) in itertools.product(
-           TEST_GRAPHS, use_optimizer_options, precision_mode_options,
-           dynamic_infer_engine_options, dynamic_calib_engine_options):
-    if precision_mode == MODE_INT8:
-      if not dynamic_calib_engine and dynamic_infer_engine:
-        # TODO(aaroey): test this case, the conversion from static calibration
-        # engine to dynamic inference engine should be a noop.
-        continue
-      if use_optimizer:
-        # TODO(aaroey): if use_optimizer is True we need to get the inference
-        # graphdef using custom python wrapper class, which is not currently
-        # supported yet.
-        continue
-      if not dynamic_calib_engine:
-        # TODO(aaroey): construction of static calibration engine is not
-        # supported yet.
-        continue
-      if dynamic_calib_engine and not dynamic_infer_engine:
-        # TODO(aaroey): construction of static inference engine using dynamic
-        # calibration engine is not supported yet.
-        continue
-    else:  # In non int8 mode.
-      if dynamic_calib_engine:
-        # dynamic_calib_engine doesn't affect non-int8 modes, so just let
-        # related tests run once on dynamic_calib_engine=False.
-        continue
-    yield _GetTest(graph_key, use_optimizer, precision_mode,
-                   dynamic_infer_engine, dynamic_calib_engine)
-
-
-if __name__ == "__main__":
-  if trt.is_tensorrt_enabled():
-    for index, t in enumerate(GetTests()):
-      setattr(TfTrtIntegrationTest, "testTfTRT_" + str(index), t)
-  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb7f5a77f011ee5c4fe748c246ac632a7bb19aff
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
@@ -0,0 +1,329 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from collections import namedtuple
+import itertools
+import warnings
+import numpy as np
+import six
+
+from tensorflow.contrib.tensorrt.python import trt_convert
+# pylint: disable=unused-import
+from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
+# pylint: enable=unused-import
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
+
+TfTrtIntegrationTestParams = namedtuple("TfTrtIntegrationTestParams", [
+    "gdef", "input_names", "input_dims", "num_expected_engines",
+    "expected_output_dims", "allclose_atol", "allclose_rtol"
+])
+
+PRECISION_MODES = ["FP32", "FP16", "INT8"]
+
+
+def _IsQuantizationMode(mode):
+  return mode == "INT8"
+
+
+class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
+  """Class to test Tensorflow-TensorRT integration."""
+
+  @property
+  def output_name(self):
+    return "output"
+
+  @property
+  def trt_incompatible_op(self):
+    return math_ops.sin
+
+  @property
+  def precision_modes(self):
+    return ["FP32", "FP16", "INT8"]
+
+  def _ToBytes(self, s):
+    if six.PY2:
+      return s
+    else:
+      return s.encode("utf-8")
+
+  def _ToString(self, s):
+    if six.PY2:
+      return s
+    else:
+      return s.decode("utf-8")
+
+  def setUp(self):
+    """Setup method."""
+    super(TfTrtIntegrationTestBase, self).setUp()
+    warnings.simplefilter("always")
+
+  def GetParams(self):
+    """Return a TfTrtIntegrationTestParams for test, implemented by subclass."""
+    raise NotImplementedError()
+
+  def _GetConfigProto(self,
+                      params,
+                      use_optimizer,
+                      precision_mode=None,
+                      is_dynamic_op=None):
+    """Get config proto based on specific settings."""
+    if use_optimizer:
+      rewriter_cfg = rewriter_config_pb2.RewriterConfig()
+      rewriter_cfg.optimizers.extend(["constfold", "layout"])
+      custom_op = rewriter_cfg.custom_optimizers.add()
+      custom_op.name = "TensorRTOptimizer"
+      custom_op.parameter_map["minimum_segment_size"].i = 3
+      custom_op.parameter_map["max_batch_size"].i = max(
+          [dims[0] for dims in params.input_dims])
+      custom_op.parameter_map["is_dynamic_op"].b = is_dynamic_op
+      custom_op.parameter_map["max_workspace_size_bytes"].i = 1 << 25
+      custom_op.parameter_map["precision_mode"].s = self._ToBytes(
+          precision_mode)
+      graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg)
+    else:
+      graph_options = config_pb2.GraphOptions()
+
+    gpu_options = config_pb2.GPUOptions()
+    gpu_options.allow_growth = True
+    if trt_convert.get_linked_tensorrt_version()[0] == 3:
+      gpu_options.per_process_gpu_memory_fraction = 0.50
+
+    config = config_pb2.ConfigProto(
+        gpu_options=gpu_options, graph_options=graph_options)
+    return config
+
+  def _RunGraph(self, params, gdef, input_data, config, num_runs=2):
+    """Run given graphdef multiple times."""
+    assert len(params.input_names) == len(input_data)
+    g = ops.Graph()
+    with g.as_default():
+      io_ops = importer.import_graph_def(
+          graph_def=gdef,
+          return_elements=params.input_names + [self.output_name],
+          name="")
+      inp = [i.outputs[0] for i in io_ops[:-1]]
+      assert len(inp) == len(input_data)
+      out = io_ops[-1].outputs[0]
+    with self.test_session(
+        graph=g, config=config, use_gpu=True, force_gpu=True) as sess:
+      val = None
+      # Defaults to 2 runs to verify result across multiple runs is same.
+      for _ in range(num_runs):
+        new_val = sess.run(out,
+                           {inp[i]: input_data[i] for i in range(len(inp))})
+        self.assertEqual(params.expected_output_dims, new_val.shape)
+        if val is not None:
+          self.assertAllEqual(val, new_val)
+        val = new_val
+    return val
+
+  # Use real data that is representative of the inference dataset
+  # for calibration. For this test script it is random data.
+  def _RunCalibration(self, params, gdef, input_data, config):
+    """Run calibration on given graph."""
+    return self._RunGraph(params, gdef, input_data, config, 30)
+
+  def _GetTrtGraphDef(self, params, gdef, precision_mode, is_dynamic_op):
+    """Return trt converted graphdef."""
+    return trt_convert.create_inference_graph(
+        input_graph_def=gdef,
+        outputs=[self.output_name],
+        max_batch_size=max([dims[0] for dims in params.input_dims]),
+        max_workspace_size_bytes=1 << 25,
+        precision_mode=precision_mode,
+        minimum_segment_size=2,
+        is_dynamic_op=is_dynamic_op)
+
+  def _VerifyGraphDef(self,
+                      params,
+                      gdef,
+                      precision_mode=None,
+                      is_calibrated=None,
+                      dynamic_engine=None):
+    num_engines = 0
+    for n in gdef.node:
+      # TODO(jie): we should have coverage for failed conversion (TF fallback).
+      # where the conversion will fail and we shouldn't count this engine as the
+      # converted engines.
+      if n.op == "TRTEngineOp":
+        num_engines += 1
+        self.assertNotEqual(self._ToBytes(""), n.attr["serialized_segment"].s)
+        self.assertNotEqual(self._ToBytes(""), n.attr["segment_funcdef_name"].s)
+        self.assertEqual(
+            self._ToBytes(precision_mode), n.attr["precision_mode"].s)
+        self.assertEqual(not dynamic_engine, n.attr["static_engine"].b)
+        if _IsQuantizationMode(precision_mode) and is_calibrated:
+          self.assertNotEqual(self._ToBytes(""), n.attr["calibration_data"].s)
+        else:
+          self.assertEqual(self._ToBytes(""), n.attr["calibration_data"].s)
+    if precision_mode is None:  # This means gdef is the original GraphDef.
+      self.assertEqual(0, num_engines)
+    else:
+      self.assertEqual(num_engines, params.num_expected_engines)
+
+  def RunTest(self, params, use_optimizer, precision_mode,
+              dynamic_infer_engine, dynamic_calib_engine):
+    assert precision_mode in PRECISION_MODES
+    input_data = [np.random.random_sample(dims) for dims in params.input_dims]
+    input_gdef = params.gdef
+    self._VerifyGraphDef(params, input_gdef)
+
+    # Get reference result without running trt.
+    config_no_trt = self._GetConfigProto(params, False)
+    logging.info("Running original graph w/o trt, config:\n%s",
+                 str(config_no_trt))
+    ref_result = self._RunGraph(params, input_gdef, input_data, config_no_trt)
+
+    # Run calibration if necessary.
+    if _IsQuantizationMode(precision_mode):
+
+      calib_config = self._GetConfigProto(params, use_optimizer, precision_mode,
+                                          dynamic_calib_engine)
+      logging.info("Running calibration graph, config:\n%s", str(calib_config))
+      if use_optimizer:
+        self.assertTrue(False)
+        # TODO(aaroey): uncomment this and get infer_gdef when this mode is
+        # supported.
+        # result = self._RunCalibration(params, input_gdef, input_data,
+        #                               calib_config)
+      else:
+        calib_gdef = self._GetTrtGraphDef(params, input_gdef, precision_mode,
+                                          dynamic_calib_engine)
+        self._VerifyGraphDef(params, calib_gdef, precision_mode, False,
+                             dynamic_calib_engine)
+        result = self._RunCalibration(params, calib_gdef, input_data,
+                                      calib_config)
+        infer_gdef = trt_convert.calib_graph_to_infer_graph(calib_gdef)
+        self._VerifyGraphDef(params, infer_gdef, precision_mode, True,
+                             dynamic_calib_engine)
+
+      self.assertAllClose(
+          ref_result,
+          result,
+          atol=params.allclose_atol,
+          rtol=params.allclose_rtol)
+    else:
+      infer_gdef = input_gdef
+
+    # Run inference.
+    infer_config = self._GetConfigProto(params, use_optimizer, precision_mode,
+                                        dynamic_infer_engine)
+    logging.info("Running final inference graph, config:\n%s",
+                 str(infer_config))
+    if use_optimizer:
+      result = self._RunGraph(params, infer_gdef, input_data, infer_config)
+    else:
+      trt_infer_gdef = self._GetTrtGraphDef(params, infer_gdef, precision_mode,
+                                            dynamic_infer_engine)
+      self._VerifyGraphDef(params, trt_infer_gdef, precision_mode, True,
+                           dynamic_infer_engine)
+      result = self._RunGraph(params, trt_infer_gdef, input_data, infer_config)
+
+    self.assertAllClose(
+        ref_result,
+        result,
+        atol=params.allclose_atol,
+        rtol=params.allclose_rtol)
+
+  def testIdempotence(self):
+    # Test that applying tensorrt optimizer or offline conversion tools multiple
+    # times to the same graph will result in same graph.
+    #
+    # TODO(aaroey): currently the conversion is not deterministic, this is
+    # mainly because during tensorflow::ConvertGraphDefToGraph(), the graph uses
+    # EdgeSet which use a map keyed by Edge*, so the order of input/output edges
+    # of a node is nondeterministic, thus the order for segmenter to contract
+    # edges is nondeterministic. Need to evaluate whether we should fix this.
+    pass
+
+
+def _AddTests(test_class):
+  """Adds test methods to TfTrtIntegrationTestBase."""
+
+  def _GetTest(use_optimizer, precision_mode, dynamic_infer_engine,
+               dynamic_calib_engine):
+    """Gets a single test method based on the parameters."""
+
+    def _Test(self):
+      params = self.GetParams()
+      logging.info(
+          "Running test with parameters: use_optimizer=%s, precision_mode=%s, "
+          "dynamic_infer_engine=%s, dynamic_calib_engine=%s", use_optimizer,
+          precision_mode, dynamic_infer_engine, dynamic_calib_engine)
+      self.RunTest(params, use_optimizer, precision_mode, dynamic_infer_engine,
+                   dynamic_calib_engine)
+
+    return _Test
+
+  use_optimizer_options = [False, True]
+  dynamic_infer_engine_options = [False, True]
+  dynamic_calib_engine_options = [False, True]
+  for (use_optimizer, precision_mode,
+       dynamic_infer_engine, dynamic_calib_engine) in itertools.product(
+           use_optimizer_options, PRECISION_MODES, dynamic_infer_engine_options,
+           dynamic_calib_engine_options):
+    if _IsQuantizationMode(precision_mode):
+      if not dynamic_calib_engine and dynamic_infer_engine:
+        # TODO(aaroey): test this case, the conversion from static calibration
+        # engine to dynamic inference engine should be a noop.
+        continue
+      if use_optimizer:
+        # TODO(aaroey): if use_optimizer is True we need to get the inference
+        # graphdef using custom python wrapper class, which is not currently
+        # supported yet.
+        continue
+      if not dynamic_calib_engine:
+        # TODO(aaroey): construction of static calibration engine is not
+        # supported yet.
+        continue
+      if dynamic_calib_engine and not dynamic_infer_engine:
+        # TODO(aaroey): construction of static inference engine using dynamic
+        # calibration engine is not supported yet.
+        continue
+    else:  # In non int8 mode.
+      if dynamic_calib_engine:
+        # dynamic_calib_engine doesn't affect non-int8 modes, so just let
+        # related tests run once on dynamic_calib_engine=False.
+        continue
+
+    conversion = "OptimizerConversion" if use_optimizer else "ToolConversion"
+    infer_engine_type = ("DynamicInferEngine"
+                         if dynamic_infer_engine else "StaticInferEngine")
+    calib_engine_type = ""
+    if precision_mode == "INT8":
+      calib_engine_type = ("DynamicCalibEngine"
+                           if dynamic_calib_engine else "StaticCalibEngine")
+    test_name = "%s_%s_%s%s" % (conversion, precision_mode, infer_engine_type,
+                                ("_" + calib_engine_type)
+                                if len(calib_engine_type) else "")
+    setattr(
+        test_class, "testTfTRT_" + test_name,
+        _GetTest(use_optimizer, precision_mode, dynamic_infer_engine,
+                 dynamic_calib_engine))
+
+
+if trt_convert.is_tensorrt_enabled():
+  _AddTests(TfTrtIntegrationTestBase)
diff --git a/tensorflow/contrib/tensorrt/test/unary_test.py b/tensorflow/contrib/tensorrt/test/unary_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9e977cf67b4e94282c10313477276b04ea828aa
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/unary_test.py
@@ -0,0 +1,110 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class UnaryTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Test for unary operations in TF-TRT."""
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [12, 5, 8, 1, 1, 12]
+    input2_name = "input_2"
+    input2_dims = [12, 5, 8, 1, 12, 1, 1]
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+      q = math_ops.abs(x)
+      q = q + 1.0
+      q = gen_math_ops.exp(q)
+      q = gen_math_ops.log(q)
+      q = array_ops.squeeze(q, axis=-2)
+      q = math_ops.abs(q)
+      q = q + 2.2
+      q = gen_math_ops.sqrt(q)
+      q = gen_math_ops.rsqrt(q)
+      q = math_ops.negative(q)
+      q = array_ops.squeeze(q, axis=3)
+      q = math_ops.abs(q)
+      q = q + 3.0
+      a = gen_math_ops.reciprocal(q)
+
+      x = constant_op.constant(np.random.randn(5, 8, 12), dtype=dtype)
+      q = math_ops.abs(x)
+      q = q + 2.0
+      q = gen_math_ops.exp(q)
+      q = gen_math_ops.log(q)
+      q = math_ops.abs(q)
+      q = q + 2.1
+      q = gen_math_ops.sqrt(q)
+      q = gen_math_ops.rsqrt(q)
+      q = math_ops.negative(q)
+      q = math_ops.abs(q)
+      q = q + 4.0
+      b = gen_math_ops.reciprocal(q)
+
+      # TODO(jie): this one will break, broadcasting on batch.
+      x = array_ops.placeholder(
+          dtype=dtype, shape=input2_dims, name=input2_name)
+      q = math_ops.abs(x)
+      q = q + 5.0
+      q = gen_math_ops.exp(q)
+      q = array_ops.squeeze(q, axis=[-1, -2, 3])
+      q = gen_math_ops.log(q)
+      q = math_ops.abs(q)
+      q = q + 5.1
+      q = gen_array_ops.reshape(q, [12, 5, 1, 1, 8, 1, 12])
+      q = array_ops.squeeze(q, axis=[5, 2, 3])
+      q = gen_math_ops.sqrt(q)
+      q = math_ops.abs(q)
+      q = q + 5.2
+      q = gen_math_ops.rsqrt(q)
+      q = math_ops.negative(q)
+      q = math_ops.abs(q)
+      q = q + 5.3
+      c = gen_math_ops.reciprocal(q)
+
+      q = a * b
+      q = q / c
+      array_ops.squeeze(q, name=self.output_name)
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name, input2_name],
+        input_dims=[input_dims, input2_dims],
+        num_expected_engines=5,
+        expected_output_dims=(12, 5, 8, 12),
+        allclose_atol=1.e-03,
+        allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py b/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b134c3bce2b36e4530f8f8e58cce8d07c9bb13b
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
@@ -0,0 +1,82 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.platform import test
+
+
+class VGGBlockNCHWTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Single vgg layer in NCHW unit tests in TF-TRT."""
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [5, 2, 8, 8]
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+      x, _, _ = nn_impl.fused_batch_norm(
+          x,
+          np.random.randn(2).astype(np.float32),
+          np.random.randn(2).astype(np.float32),
+          mean=np.random.randn(2).astype(np.float32),
+          variance=np.random.randn(2).astype(np.float32),
+          data_format="NCHW",
+          is_training=False)
+      e = constant_op.constant(
+          np.random.randn(1, 1, 2, 6), name="weights", dtype=dtype)
+      conv = nn.conv2d(
+          input=x,
+          filter=e,
+          data_format="NCHW",
+          strides=[1, 1, 2, 2],
+          padding="SAME",
+          name="conv")
+      b = constant_op.constant(np.random.randn(6), name="bias", dtype=dtype)
+      t = nn.bias_add(conv, b, data_format="NCHW", name="biasAdd")
+      relu = nn.relu(t, "relu")
+      idty = array_ops.identity(relu, "ID")
+      v = nn_ops.max_pool(
+          idty, [1, 1, 2, 2], [1, 1, 2, 2],
+          "VALID",
+          data_format="NCHW",
+          name="max_pool")
+      array_ops.squeeze(v, name="output")
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[input_dims],
+        num_expected_engines=1,
+        expected_output_dims=(5, 6, 2, 2),
+        allclose_atol=1.e-03,
+        allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_test.py b/tensorflow/contrib/tensorrt/test/vgg_block_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bec2f23eff3b1799d70519462f42c326d17924c1
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/vgg_block_test.py
@@ -0,0 +1,73 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import nn_impl
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.platform import test
+
+
+class VGGBlockTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Single vgg layer test in TF-TRT conversion."""
+    dtype = dtypes.float32
+    input_name = "input"
+    input_dims = [5, 8, 8, 2]
+    g = ops.Graph()
+    with g.as_default():
+      x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)
+      x, _, _ = nn_impl.fused_batch_norm(
+          x,
+          np.random.randn(2).astype(np.float32),
+          np.random.randn(2).astype(np.float32),
+          mean=np.random.randn(2).astype(np.float32),
+          variance=np.random.randn(2).astype(np.float32),
+          is_training=False)
+      e = constant_op.constant(
+          np.random.randn(1, 1, 2, 6), name="weights", dtype=dtype)
+      conv = nn.conv2d(
+          input=x, filter=e, strides=[1, 2, 2, 1], padding="SAME", name="conv")
+      b = constant_op.constant(np.random.randn(6), name="bias", dtype=dtype)
+      t = nn.bias_add(conv, b, name="biasAdd")
+      relu = nn.relu(t, "relu")
+      idty = array_ops.identity(relu, "ID")
+      v = nn_ops.max_pool(
+          idty, [1, 2, 2, 1], [1, 2, 2, 1], "VALID", name="max_pool")
+      array_ops.squeeze(v, name="output")
+    return trt_test.TfTrtIntegrationTestParams(
+        gdef=g.as_graph_def(),
+        input_names=[input_name],
+        input_dims=[input_dims],
+        num_expected_engines=1,
+        expected_output_dims=(5, 2, 2, 6),
+        allclose_atol=1.e-03,
+        allclose_rtol=1.e-03)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index ef6c75285195adfbe7181fc7aa084a62640ff22c..5a7825f29a29585af87c113b2475fb9a1d795d75 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -15,6 +15,7 @@ package(
     default_visibility = [
         "//cloud/vmm/testing/tests/tpu:__subpackages__",
         "//learning/brain:__subpackages__",
+        "//learning/deepmind:__subpackages__",
         "//tensorflow:__subpackages__",
     ],
 )
@@ -36,6 +37,7 @@ cc_library(
 py_library(
     name = "tpu_estimator",
     srcs = [
+        "python/tpu/error_handling.py",
         "python/tpu/tpu_config.py",
         "python/tpu/tpu_context.py",
         "python/tpu/tpu_estimator.py",
@@ -165,6 +167,17 @@ py_library(
         "python/tpu/keras_support.py",
     ],
     srcs_version = "PY2AND3",
+    visibility = [
+        "//cloud/vmm/testing/tests/tpu:__subpackages__",
+        "//learning/brain:__subpackages__",
+        # TODO(b/111651964): Clean special visibility for keras_support.
+        #
+        # Note: If you are an end user, please do not add your project to this
+        # visibility. This feature is experimental, and will be made public
+        # when ready.
+        "//third_party/cloud_tpu/models/keras:__subpackages__",
+        "//tensorflow:__subpackages__",
+    ],
     deps = [
         ":tpu_lib",
         ":tpu_py",
diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
index f0fca63db0bca80cdaa27e491b2a03ae2246c007..da4a95e0450a9d0c20593ca60b69f3ad467d455d 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler.proto
@@ -11,6 +11,9 @@ service TPUProfiler {
   // Starts a profiling session, blocks until it completes, and returns data.
   rpc Profile(ProfileRequest) returns (ProfileResponse) {
   }
+  // Collects profiling data and returns user-friendly metrics.
+  rpc Monitor(MonitorRequest) returns (MonitorResponse) {
+  }
 }
 
 message ProfileOptions {
@@ -104,3 +107,26 @@ message ProfileResponse {
 
   // next-field: 8
 }
+
+message MonitorRequest {
+  // Duration for which to profile between each update.
+  uint64 duration_ms = 1;
+
+  // Indicates the level at which we want to monitor. Currently, two levels are
+  // supported:
+  // Level 1: An ultra lightweight mode that captures only some utilization
+  // metrics.
+  // Level 2: More verbose than level 1. Collects utilization metrics, device
+  // information, step time information, etc. Do not use this option if the TPU
+  // host is being very heavily used.
+  int32 monitoring_level = 2;
+
+  // next-field: 3
+}
+
+message MonitorResponse {
+  // Properly formatted string data that can be directly returned back to user.
+  string data = 1;
+
+  // next-field: 2
+}
diff --git a/tensorflow/contrib/tpu/proto/optimization_parameters.proto b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
index 9150606f5eb254a824f4632df53329cbf2edf570..2cc17d6d928370afbb0e3b1e89252f7a687c27d3 100644
--- a/tensorflow/contrib/tpu/proto/optimization_parameters.proto
+++ b/tensorflow/contrib/tpu/proto/optimization_parameters.proto
@@ -1,10 +1,12 @@
-syntax = "proto2";
+syntax = "proto3";
 
 package tensorflow.tpu;
 
+import "google/protobuf/wrappers.proto";
+
 message ClippingLimits {
-  optional float lower = 1 [default = -inf];
-  optional float upper = 2 [default = inf];
+  google.protobuf.FloatValue lower = 1;  // -inf if not set
+  google.protobuf.FloatValue upper = 2;  // +inf if not set
 }
 
 // Get the learning rate from a <yet to be determined> source that can change
@@ -21,18 +23,18 @@ message LearningRate {
 }
 
 message AdagradParameters {
-  optional float initial_accumulator = 1 [default = 0.];
+  float initial_accumulator = 1;
 }
 
 message StochasticGradientDescentParameters {
 }
 
 message FtrlParameters {
-  optional float l1 = 1 [default = 0.];
-  optional float l2 = 2 [default = 0.];
-  optional float lr_power = 3 [default = 0.];
-  optional float initial_accum = 4 [default = 0.];
-  optional float initial_linear = 5 [default = 0.];
+  float l1 = 1;
+  float l2 = 2;
+  float lr_power = 3;
+  float initial_accum = 4;
+  float initial_linear = 5;
 }
 
 // The Adam optimizer does not implement hyper-parameter update; use the dynamic
@@ -41,84 +43,84 @@ message FtrlParameters {
 // Here, t is the current timestep.
 // https://github.com/tensorflow/tensorflow/blob/ab51450c817674c8ff08a7ae4f8ac50cdc4bed8b/tensorflow/python/training/adam.py#L54
 message AdamParameters {
-  optional float beta1 = 3 [default = 0.];
-  optional float beta2 = 4 [default = 0.];
-  optional float epsilon = 5 [default = 0.];
-  optional float initial_m = 6 [default = 0.];
-  optional float initial_v = 7 [default = 0.];
+  float beta1 = 3;
+  float beta2 = 4;
+  float epsilon = 5;
+  float initial_m = 6;
+  float initial_v = 7;
 }
 
 message MomentumParameters {
-  optional float momentum = 1 [default = 0.];
-  optional bool use_nesterov = 2 [default = false];
-  optional float initial_accum = 3 [default = 0.];
+  float momentum = 1;
+  bool use_nesterov = 2;
+  float initial_accum = 3;
 }
 
 message RmsPropParameters {
-  optional float rho = 1 [default = 0.];
-  optional float momentum = 2 [default = 0.];
-  optional float epsilon = 3 [default = 0.];
-  optional float initial_ms = 4 [default = 0.];
-  optional float initial_mom = 5 [default = 0.];
+  float rho = 1;
+  float momentum = 2;
+  float epsilon = 3;
+  float initial_ms = 4;
+  float initial_mom = 5;
 }
 
 message CenteredRmsPropParameters {
-  optional float rho = 1 [default = 0.];
-  optional float momentum = 2 [default = 0.];
-  optional float epsilon = 3 [default = 0.];
-  optional float initial_ms = 4 [default = 0.];
-  optional float initial_mom = 5 [default = 0.];
-  optional float initial_mg = 6 [default = 0.];
+  float rho = 1;
+  float momentum = 2;
+  float epsilon = 3;
+  float initial_ms = 4;
+  float initial_mom = 5;
+  float initial_mg = 6;
 }
 
 message MdlAdagradLightParameters {
-  optional float l2 = 1;
-  optional float lr_power = 2;
-  optional float min_servable_mdl_benefit = 3;
-  optional float mdl_mix_in_margin = 4;
-  optional float mdl_benefit_rampup_coeff = 5;
-  optional float mdl_min_weight = 6;
-  optional float benefit_revisit_scale = 7;
-  optional float max_event_benefit = 8;
-  optional float max_total_benefit = 9;
-  optional float mdl_hard_limit = 10;
-  optional bool hard_limit_min_benefit = 11;
-  optional bool mdl_regularize = 12;
-  optional float initial_accumulator = 13;
-  optional float initial_weight = 14;
-  optional float initial_benefit = 15;
+  float l2 = 1;
+  float lr_power = 2;
+  float min_servable_mdl_benefit = 3;
+  float mdl_mix_in_margin = 4;
+  float mdl_benefit_rampup_coeff = 5;
+  float mdl_min_weight = 6;
+  float benefit_revisit_scale = 7;
+  float max_event_benefit = 8;
+  float max_total_benefit = 9;
+  float mdl_hard_limit = 10;
+  bool hard_limit_min_benefit = 11;
+  bool mdl_regularize = 12;
+  float initial_accumulator = 13;
+  float initial_weight = 14;
+  float initial_benefit = 15;
 }
 
 message AdadeltaParameters {
-  optional float rho = 1;
-  optional float epsilon = 2;
-  optional float initial_accumulator = 3 [default = 0.];
-  optional float initial_update = 4 [default = 0.];
+  float rho = 1;
+  float epsilon = 2;
+  float initial_accumulator = 3;
+  float initial_update = 4;
 }
 
 message ProximalAdagradParameters {
-  optional float l1 = 1;
-  optional float l2 = 2;
-  optional float initial_accumulator = 3;
+  float l1 = 1;
+  float l2 = 2;
+  float initial_accumulator = 3;
 }
 
 message OptimizationParameters {
   // Learning rate used for updating the embedding layer parameters.
-  optional LearningRate learning_rate = 13;
+  LearningRate learning_rate = 13;
   reserved 1;  // Old learning rate tag.
 
   // Limits to which to clip the weight values after the backward pass; not
   // present means no limits are applied.
-  optional ClippingLimits clipping_limits = 2;
+  ClippingLimits clipping_limits = 2;
 
   // Limits to which to clip the backward pass gradient before using it for
   // updates; not present means no limits are applied.
-  optional ClippingLimits gradient_clipping_limits = 7;
+  ClippingLimits gradient_clipping_limits = 7;
 
   // Whether to use gradient accumulation (do two passes over the input
   // gradients: one to accumulate them into a temporary array and another to
   // apply them using the actual optimization algorithm).
-  optional bool use_gradient_accumulation = 15 [default = false];
+  bool use_gradient_accumulation = 15;
 
   // Optimization algorithm parameters; which field is selected determines which
   // algorithm to use.
@@ -140,7 +142,7 @@ message OptimizationParameters {
 // value vector and any extra accumulators, etc.).
 message StateVariableSpecification {
   // Parameter name for the state variable.
-  optional string name = 1;
+  string name = 1;
 
   // A normal state variable that should be saved and restored in checkpoints
   // and used as an input or output to non-debug TensorFlow ops.
@@ -151,7 +153,7 @@ message StateVariableSpecification {
   // from users (used for intermediate gradients being accumulated, for
   // example).
   message FillWithConstant {
-    optional double initial_value = 1;
+    double initial_value = 1;
   }
 
   // Usage type of this state variable.
diff --git a/tensorflow/contrib/tpu/python/tpu/error_handling.py b/tensorflow/contrib/tpu/python/tpu/error_handling.py
new file mode 100644
index 0000000000000000000000000000000000000000..52e1ea42370d653d1de7c12eee4b456ec7ce921c
--- /dev/null
+++ b/tensorflow/contrib/tpu/python/tpu/error_handling.py
@@ -0,0 +1,132 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===================================================================
+"""ErrorRendezvous handler for collecting errors from multiple threads."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import sys
+import threading
+import time
+
+import six
+
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import tf_logging as logging
+
+_UNINTERESTING_ERRORS = (errors.CancelledError,)
+
+
+class ErrorRendezvous(object):
+  """Resolve errors from multiple threads during TPU execution.
+
+  TPU errors can occur on the infeed or outfeed threads as well as the main
+  training thread.
+
+  Depending on which thread "wins" and receives the session error first, we may
+  end up showing users a confusing and non-actionable error message (session
+  cancelled) instead of a root cause (e.g. a bad filename).
+
+  The rendezvous object provides a location to capture these errors until all
+  threads terminate.  At that point we can choose the most informative error
+  to report.
+  """
+
+  def __init__(self, num_sources):
+    # string -> (message, traceback)
+    self._errors = {}
+    self._num_sources = num_sources
+    self._session_cancel_timer = None
+
+  def record_error(self, source, exc_info, session=None):
+    """Report an exception from the given source.
+
+    If a session is passed, a timer will be registered to close it after a few
+    seconds.  This is necessary to ensure the main training loop does not hang
+    if an infeed/oufeed error occurs.  We sleep a few seconds to allow a more
+    interesting error from another thread to propagate.
+
+    Args:
+      source: string, source of the error
+      exc_info: Output from `sys.exc_info` (type, value, traceback)
+      session: Session to close after delay.
+    """
+    _, value, _ = exc_info
+    self._errors[source] = exc_info
+    logging.info('Error recorded from %s: %s', source, value)
+
+    if session is not None and self._session_cancel_timer is None:
+
+      def _cancel_session():
+        time.sleep(5)
+        try:
+          session.close()
+        except:  # pylint: disable=bare-except
+          pass
+
+      self._session_cancel_timer = threading.Thread(target=_cancel_session,)
+      self._session_cancel_timer.daemon = True
+      self._session_cancel_timer.start()
+
+  def record_done(self, source):
+    """Mark execution source `source` as done.
+
+    If an error was originally reported from `source` it is left intact.
+
+    Args:
+      source: `str`, source being recorded
+    """
+    logging.info('%s marked as finished', source)
+    if source not in self._errors:
+      self._errors[source] = None
+
+  @contextlib.contextmanager
+  def catch_errors(self, source, session=None):
+    """Context manager to report any errors within a block."""
+    try:
+      yield
+    except Exception:  # pylint: disable=broad-except
+      self.record_error(source, sys.exc_info(), session)
+
+  def raise_errors(self, timeout_sec=0):
+    """Wait for up to `timeout` seconds for all error sources to finish.
+
+    Preferentially raise "interesting" errors (errors not in the
+    _UNINTERESTING_ERRORS) set.
+
+    Args:
+      timeout_sec: Seconds to wait for other error sources.
+    """
+    for _ in range(timeout_sec):
+      if len(self._errors) == self._num_sources:
+        break
+      time.sleep(1)
+
+    kept_errors = [(k, v) for (k, v) in self._errors.items() if v is not None]
+
+    # First check for any interesting errors, then fall back on the session
+    # cancelled errors etc.
+    for k, (typ, value, traceback) in kept_errors:
+      if isinstance(value, _UNINTERESTING_ERRORS):
+        continue
+      else:
+        logging.warn('Reraising captured error')
+        six.reraise(typ, value, traceback)
+
+    for k, (typ, value, traceback) in kept_errors:
+      logging.warn('Reraising captured error')
+      six.reraise(typ, value, traceback)
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
index 8292c920fc27f3693f581770c80908e5b10cfc45..81798ee42313cb9e2232a4796f56d4d16068b82f 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -441,21 +441,23 @@ class TPUNumpyInfeedManager(TPUInfeedManager):
     shard_infeed_tensors = []
 
     for shard_id in range(self._strategy.num_towers):
-      with ops.device('/device:TPU:%d' % shard_id):
+      with ops.device('/device:CPU:0'):
         infeed_tensors = []
-        for spec in input_specs:
-          # Construct placeholders for each of the inputs.
-          infeed_tensors.append(
-              array_ops.placeholder(
-                  dtype=spec.dtype,
-                  shape=spec.shape,
-                  name='infeed-enqueue-%s-%d' % (spec.name, shard_id)))
+        with ops.device('/device:TPU:%d' % shard_id):
+          for spec in input_specs:
+            # Construct placeholders for each of the inputs.
+            infeed_tensors.append(
+                array_ops.placeholder(
+                    dtype=spec.dtype,
+                    shape=spec.shape,
+                    name='infeed-enqueue-%s-%d' % (spec.name, shard_id)))
         shard_infeed_tensors.append(infeed_tensors)
 
         infeed_op.append(
             tpu_ops.infeed_enqueue_tuple(
                 infeed_tensors, [spec.shape for spec in input_specs],
-                name='infeed-enqueue-%s-%d' % (execution_mode, shard_id)))
+                name='infeed-enqueue-%s-%d' % (execution_mode, shard_id),
+                device_ordinal=shard_id))
     return SizedInfeed(infeed_ops=infeed_op,
                        sharded_infeed_tensors=shard_infeed_tensors)
 
@@ -584,12 +586,13 @@ class TPUDatasetInfeedManager(TPUInfeedManager):
     assert len(shard_infeed_tensors) == self._strategy.num_towers
     infeed_ops = []
     for shard_id in range(self._strategy.num_towers):
-      with ops.device('/device:TPU:%d' % shard_id):
+      with ops.device('/device:CPU:0'):
         infeed_ops.append(
             tpu_ops.infeed_enqueue_tuple(
                 shard_infeed_tensors[shard_id],
                 [spec.shape for spec in input_specs],
-                name='infeed-enqueue-%s-%d' % (execution_mode, shard_id)))
+                name='infeed-enqueue-%s-%d' % (execution_mode, shard_id),
+                device_ordinal=shard_id))
     return SizedInfeed(infeed_ops=infeed_ops,
                        sharded_infeed_tensors=shard_infeed_tensors)
 
@@ -740,12 +743,13 @@ class TPUFunction(object):
     # Build output ops.
     outfeed_op = []
     for shard_id in range(self._strategy.num_towers):
-      with ops.device('/device:TPU:%d' % shard_id):
+      with ops.device('/device:CPU:0'):
         outfeed_op.extend(
             tpu_ops.outfeed_dequeue_tuple(
                 dtypes=[spec.dtype for spec in self._outfeed_spec],
                 shapes=[spec.shape for spec in self._outfeed_spec],
-                name='outfeed-dequeue-%s-%d' % (self.execution_mode, shard_id)))
+                name='outfeed-dequeue-%s-%d' % (self.execution_mode, shard_id),
+                device_ordinal=shard_id))
 
     return TPUModelOp(
         compile_op,
@@ -1126,7 +1130,7 @@ Output shape: %(output_shape)s
       'layer': layer,
       'input_shape': layer.input_shape,
       'output_shape': layer.output_shape
-      })
+  })
 
 
 @experimental
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 7216626a58cabe73ac1c3721ccd838cccad4b986..92c1eaba710d888d461dad39766bb9189ad1ab78 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -314,7 +314,9 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
       # Capture the device function stack at the time of first entry
       # since that is the stack that will be used outside_compilation.
       graph = ops.get_default_graph()
-      self._outer_device_function_stack = list(graph._device_function_stack)  # pylint: disable=protected-access
+      # pylint: disable=protected-access
+      self._outer_device_function_stack = graph._device_function_stack.copy()
+      # pylint: enable=protected-access
     super(TPUReplicateContext, self).Enter()
 
   def HostComputeCore(self):
@@ -633,23 +635,14 @@ def split_compile_and_replicate(computation,
     with tpu_function.tpu_shard_context(
         num_replicas), ops.control_dependencies([metadata]):
 
-      # For backward compatibility reasons, we tag replicated inputs with the
-      # _tpu_replicated_input attribute. This does nothing and exists only for
-      # backward compatibility.
-      # TODO(phawkins): delete the attr_scope after 6/28/2018.
-      # pylint: disable=protected-access
-      with graph._attr_scope({
-          "_tpu_replicated_input": attr_value_pb2.AttrValue(b=True)
-      }):
-        # Add identity ops so even unused inputs are "consumed" by the
-        # computation. This is to avoid orphaned TPUReplicatedInput nodes.
-        # TODO(phawkins): consider instead pruning unused TPUReplicatedInput
-        # and eliding trivial TPUReplicatedInput/TPUReplicatedOutput pairs.
-        computation_inputs = [
-            array_ops.identity(x, name="replicated_input_{}".format(i))
-            for i, x in enumerate(computation_inputs)
-        ]
-      # pylint: enable=protected-access
+      # Add identity ops so even unused inputs are "consumed" by the
+      # computation. This is to avoid orphaned TPUReplicatedInput nodes.
+      # TODO(phawkins): consider instead pruning unused TPUReplicatedInput
+      # and eliding trivial TPUReplicatedInput/TPUReplicatedOutput pairs.
+      computation_inputs = [
+          array_ops.identity(x, name="replicated_input_{}".format(i))
+          for i, x in enumerate(computation_inputs)
+      ]
 
       # If there is an infeed queue, adds the dequeued values to the
       # computation's inputs.
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index 211c59cb90c78e3bd6cfcdccbf3bfe697bacfe24..a9cf54f77d8192b51af094e71707a958594874f6 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -146,24 +146,7 @@ class TPUContext(object):
     # Note that: For the non-model parallelism, the mapping could be
     # a random permutation. The order should not matter in most cases
     # as far as model is replicated to all cores in the system.
-
-    # If the precise replica_id to device mapping is required, please
-    # set the num_cores_per_replica to 1 in TPUConfig to enable the
-    # model parallelism.
-    if self._internal_ctx.model_parallelism_enabled:
-      return RuntimeError(
-          'device_for_replica is not yet implemented for model parallelism. '
-          'b/79689078.')
-
-    master = self._internal_ctx.master_job
-    job_device = '' if master is None else ('/job:%s' % master)
-
-    num_of_replicas_per_host = self._internal_ctx.num_of_replicas_per_host
-    host_id = replica_id / num_of_replicas_per_host
-    ordinal_id = replica_id % num_of_replicas_per_host
-
-    host_device = '%s/task:%d/device:CPU:0' % (job_device, host_id)
-    return (host_device, ordinal_id)
+    return self._internal_ctx.device_for_replica(replica_id)
 
 
 class _InternalTPUContext(object):
@@ -595,7 +578,8 @@ class _InternalTPUContext(object):
         raise ValueError(message)
 
     if mode == model_fn_lib.ModeKeys.TRAIN:
-      if self._train_batch_size % num_replicas != 0:
+      if (self._train_batch_size % num_replicas != 0 and
+          not self.is_input_broadcast_with_iterators()):
         raise ValueError(
             'train batch size {} must be divisible by number of replicas {}'
             .format(self._train_batch_size, num_replicas))
@@ -605,11 +589,12 @@ class _InternalTPUContext(object):
         raise ValueError(
             'eval_batch_size in TPUEstimator constructor cannot be `None`'
             'if .evaluate is running on TPU.')
-      if self._eval_batch_size % num_replicas != 0:
+      if (self._eval_batch_size % num_replicas != 0 and
+          not self.is_input_broadcast_with_iterators()):
         raise ValueError(
             'eval batch size {} must be divisible by number of replicas {}'
             .format(self._eval_batch_size, num_replicas))
-      if num_hosts > 1:
+      if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
         raise ValueError(
             'TPUEstimator.evaluate should be running on single TPU worker. '
             'got {}.'.format(num_hosts))
@@ -619,11 +604,12 @@ class _InternalTPUContext(object):
         raise ValueError(
             'predict_batch_size in TPUEstimator constructor should not be '
             '`None` if .predict is running on TPU.')
-      if self._predict_batch_size % num_replicas != 0:
+      if (self._predict_batch_size % num_replicas != 0 and
+          not self.is_input_broadcast_with_iterators()):
         raise ValueError(
             'predict batch size {} must be divisible by number of replicas {}'
             .format(self._predict_batch_size, num_replicas))
-      if num_hosts > 1:
+      if num_hosts > 1 and not self.is_input_broadcast_with_iterators():
         raise ValueError(
             'TPUEstimator.predict should be running on single TPU worker. '
             'got {}.'.format(num_hosts))
@@ -631,6 +617,33 @@ class _InternalTPUContext(object):
     # Record the state "validated" into lazy dictionary.
     self._lazy_validation_dict[mode] = True
 
+  def device_for_replica(self, replica_id):
+    """Returns the tuple of (CPU device and device ordinal) for replica.
+
+    This should be used for full replicate for non-model-parallelism.
+
+    Args:
+       replica_id: Int, the replica index.
+
+    Returns:
+       A tuple of device spec for CPU device and int device ordinal.
+    """
+    master = self.master_job
+
+    if self.model_parallelism_enabled:
+      return (self.device_assignment.host_device(
+          replica=replica_id, job=master),
+              self.device_assignment.tpu_ordinal(replica=replica_id))
+
+    job_device = '' if master is None else ('/job:%s' % master)
+
+    num_of_replicas_per_host = self.num_of_replicas_per_host
+    host_id = replica_id / num_of_replicas_per_host
+    ordinal_id = replica_id % num_of_replicas_per_host
+
+    host_device = '%s/task:%d/device:CPU:0' % (job_device, host_id)
+    return (host_device, ordinal_id)
+
 
 class _OneCoreTPUContext(_InternalTPUContext):
   """Special _InternalTPUContext for one core usage."""
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index c7cd7896e041323bbd0da89fe768190b410fa1d3..ee9ad525ee34ff114808a4dc7a49702b19c78543 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -22,9 +22,9 @@ import collections
 import copy
 import os
 import signal
+import sys
 import threading
 import time
-import traceback
 
 import numpy as np
 import six
@@ -32,6 +32,7 @@ from six.moves import queue as Queue  # pylint: disable=redefined-builtin
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.contrib.tpu.python.ops import tpu_ops
+from tensorflow.contrib.tpu.python.tpu import error_handling
 from tensorflow.contrib.tpu.python.tpu import session_support
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_config
@@ -257,7 +258,10 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=prote
               eval_metrics=None,
               export_outputs=None,
               scaffold_fn=None,
-              host_call=None):
+              host_call=None,
+              training_hooks=None,
+              evaluation_hooks=None,
+              prediction_hooks=None):
     """Creates a validated `TPUEstimatorSpec` instance."""
     host_calls = {}
     if eval_metrics is not None:
@@ -265,6 +269,17 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=prote
     if host_call is not None:
       host_calls['host_call'] = host_call
     _OutfeedHostCall.validate(host_calls)
+
+    training_hooks = list(training_hooks or [])
+    evaluation_hooks = list(evaluation_hooks or [])
+    prediction_hooks = list(prediction_hooks or [])
+
+    for hook in training_hooks + evaluation_hooks + prediction_hooks:
+      if not isinstance(hook, session_run_hook.SessionRunHook):
+        raise TypeError(
+            'All hooks must be SessionRunHook instances, given: {}'.format(
+                hook))
+
     return super(TPUEstimatorSpec, cls).__new__(
         cls,
         mode=mode,
@@ -274,7 +289,10 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=prote
         eval_metrics=eval_metrics,
         export_outputs=export_outputs,
         scaffold_fn=scaffold_fn,
-        host_call=host_call)
+        host_call=host_call,
+        training_hooks=training_hooks,
+        evaluation_hooks=evaluation_hooks,
+        prediction_hooks=prediction_hooks)
 
   def as_estimator_spec(self):
     """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
@@ -290,6 +308,7 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=prote
     hooks = None
     if self.host_call is not None:
       hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])]
+    hooks = list(hooks or [])
     scaffold = self.scaffold_fn() if self.scaffold_fn else None
     return model_fn_lib.EstimatorSpec(
         mode=self.mode,
@@ -299,9 +318,9 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=prote
         eval_metric_ops=eval_metric_ops,
         export_outputs=self.export_outputs,
         scaffold=scaffold,
-        training_hooks=hooks,
-        evaluation_hooks=hooks,
-        prediction_hooks=hooks)
+        training_hooks=self.training_hooks + hooks,
+        evaluation_hooks=self.evaluation_hooks + hooks,
+        prediction_hooks=self.prediction_hooks + hooks)
 
 
 class _OpQueueContext(object):
@@ -365,17 +384,17 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
                ctx,
                enqueue_ops,
                dequeue_ops,
-               run_infeed_loop_on_coordinator=True):
+               run_infeed_loop_on_coordinator=True,
+               rendezvous=None):
     self._master_job = ctx.master_job
     self._enqueue_ops = enqueue_ops
     self._dequeue_ops = dequeue_ops
+    self._rendezvous = rendezvous
 
     self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator
     self._initial_infeed_sleep_secs = (
         ctx.config.tpu_config.initial_infeed_sleep_secs)
 
-    self._session_cancel_timer = None
-
     self._feed_error = None
     self._finished = False
 
@@ -392,62 +411,6 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
     for op in summary_writer_init_ops:
       self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0]))
 
-  def _log_error(self, session, error):
-    """Log an infeed or outfeed error.
-
-    This logs a short error message immediately, and schedules a timer to
-    emit the full stack trace and error message after a short period of time.
-    If the main session has terminated by the time the timer triggers, we
-    assume the real source of the error was from the main session and avoid
-    emitting a stack trace for the infeed.
-
-    Args:
-      session: `tf.Session`, session to be terminated error: exception that
-        triggered logging.
-      error: the Exception to log.
-    """
-    logging.warning(
-        '\n\n'
-        'Error occurred during infeed/outfeed.  This may be due to a compile '
-        'error in the main session.  Waiting for a short time for the main '
-        'session to come back.\n\n%s', error)
-
-    self._feed_error = traceback.format_exc()
-
-    # If we've already encountered a feed error, don't schedule another
-    # cancellation op.
-    if self._session_cancel_timer:
-      return
-
-    def _cancel_session():
-      """Close the session to avoid the main thread from hanging.
-
-      If input pipeline triggers any error, the infeed thread dies but the main
-      thread for TPU computation waits for the infeed enqueue forever. Close the
-      Session to cancel the main thread Session.run execution.
-
-      We sleep for a few seconds before closing to give some time for the TPU
-      compilation error, if any, propagating, from TPU to CPU host. Compilation
-      errors should be reported by the main thread so that the program can be
-      interrupted and users can take action.  Due to a race condition, the
-      infeed thread might see an error first.  Closing the session here
-      immediately would result in a session cancellation exception in the main
-      thread, instead of the expected compile error.  User code that depends on
-      having the proper exception type will therefore be confused.
-      """
-      time.sleep(5)
-
-      # If the main session is still running, the infeed/outfeed errors are
-      # legitimate, and should be logged.
-      if not self._finished and self._feed_error:
-        logging.error('Feed error: %s', self._feed_error)
-        logging.error('Closing session.  A RuntimeError should follow.')
-        session.close()
-
-    self._session_cancel_timer = threading.Thread(target=_cancel_session)
-    self._session_cancel_timer.daemon = True
-    self._session_cancel_timer.start()
-
   def _run_infeed(self, queue_ctx, session):
     logging.info('Starting infeed thread controller.')
     if self._initial_infeed_sleep_secs:
@@ -456,7 +419,7 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
       time.sleep(self._initial_infeed_sleep_secs)
       logging.info('%s thread starting after sleep', self._name)
 
-    try:
+    with self._rendezvous.catch_errors(source='infeed', session=session):
       if self._run_infeed_loop_on_coordinator:
         for count, steps in enumerate(queue_ctx.read_iteration_counts()):
           for i in xrange(steps):
@@ -466,19 +429,15 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
         for _ in queue_ctx.read_iteration_counts():
           session.run(self._enqueue_ops)
       logging.info('Infeed thread finished, shutting down.')
-    except Exception as e:  # pylint: disable=broad-except
-      self._log_error(session, e)
 
   def _run_outfeed(self, queue_ctx, session):
     logging.info('Starting outfeed thread controller.')
-    try:
+    with self._rendezvous.catch_errors(source='outfeed', session=session):
       for count, steps in enumerate(queue_ctx.read_iteration_counts()):
         for i in xrange(steps):
           logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i)
           session.run(self._dequeue_ops)
       logging.info('Outfeed thread finished, shutting down.')
-    except Exception as e:  # pylint: disable=broad-except
-      self._log_error(session, e)
 
   def _create_infeed_controller(self, name, target, args):
     return _OpQueueContext(name=name, target=target, args=args)
@@ -497,11 +456,6 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
   def before_run(self, run_context):
     self._feed_error = None
 
-    # Wait for the cancellation timer to complete before continuing.
-    if self._session_cancel_timer:
-      self._session_cancel_timer.join()
-      self._session_cancel_timer = None
-
     iterations = run_context.session.run(self._iterations_per_loop_var)
 
     logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
@@ -512,16 +466,14 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
     self._outfeed_controller.send_next_batch_signal(iterations)
 
   def end(self, session):
-    if self._session_cancel_timer:
-      logging.warning('Feed error occurred; waiting for message.')
-      self._session_cancel_timer.join()
-
     self._finished = True
     logging.info('Stop infeed thread controller')
     self._infeed_controller.join()
+    self._rendezvous.record_done('infeed')
 
     logging.info('Stop output thread controller')
     self._outfeed_controller.join()
+    self._rendezvous.record_done('outfeed')
 
     logging.info('Shutdown TPU system.')
     session.run(self._finalize_ops)
@@ -529,9 +481,10 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
 
 class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):
 
-  def __init__(self, ctx, enqueue_ops, dequeue_ops):
+  def __init__(self, ctx, enqueue_ops, dequeue_ops, rendezvous=None):
     super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
-        ctx, enqueue_ops, dequeue_ops, run_infeed_loop_on_coordinator=False)
+        ctx, enqueue_ops, dequeue_ops, run_infeed_loop_on_coordinator=False,
+        rendezvous=rendezvous)
 
   def _create_infeed_controller(self, name, target, args):
     return _OpSignalOnceQueueContext(name=name, target=target, args=args)
@@ -856,9 +809,18 @@ def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
 
     is_dataset = inputs.is_dataset
     if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
-      raise TypeError('Mode PREDICT not yet supported in BROADCAST mode.')
+      if not is_dataset:
+        raise TypeError(
+            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
+            '`features` and `labels`.')
 
-    hooks.append(inputs.dataset_initializer_hook())
+      inputs = _InputsWithStoppingSignals(
+          dataset=inputs.dataset,
+          batch_size=ctx.batch_size_for_input_fn,
+          add_padding=True)
+
+    if is_dataset:
+      hooks.append(inputs.dataset_initializer_hook())
     num_replicas_per_host = ctx.num_of_replicas_per_host
 
   def tpu_ordinal_function_impl(replica_id):
@@ -874,6 +836,7 @@ def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
     """Generates enqueue ops for all the hosts."""
     broadcasted_inputs = []
     flattened_inputs = None  # Cache result from input_fn.
+    signals = None
     for host_id in xrange(num_hosts):
       with ops.device(ctx.tpu_host_placement_function(host_id=host_id)):
         for _ in xrange(ctx.num_of_replicas_per_host):
@@ -883,11 +846,13 @@ def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
           # hosts).
           if flattened_inputs is None:
             features, labels = inputs.features_and_labels()  # Calls get_next()
+            signals = inputs.signals()
+
             inputs_structure_recorder.validate_and_record_structure(
-                features, labels)
+                features, labels, signals)
             flattened_inputs = (
                 inputs_structure_recorder.flatten_features_and_labels(
-                    features, labels))
+                    features, labels, signals))
           broadcasted_inputs.append(flattened_inputs)
 
     infeed_queue = tpu_feed.InfeedQueue(
@@ -897,7 +862,14 @@ def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
         broadcasted_inputs,
         tpu_ordinal_function=tpu_ordinal_function_impl,
         placement_function=device_function_impl)
-    return enqueue_ops
+
+    if signals is None:
+      return enqueue_ops
+    else:
+      return {
+          'ops': enqueue_ops,
+          'signals': signals,
+      }
 
   return enqueue_ops_fn, captured_infeed_queue, hooks, is_dataset
 
@@ -1144,9 +1116,11 @@ class _InputPipeline(object):
       all_hooks.extend(hooks)
       if is_dataset:
         run_infeed_loop_on_coordinator = False
-        enqueue_ops.append(
-            _wrap_computation_in_while_loop(
-                device=host_device, op_fn=enqueue_ops_fn))
+        wrap_fn = (
+            _wrap_computation_in_while_loop
+            if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
+            _wrap_computation_in_while_loop_with_stopping_signals)
+        enqueue_ops.append(wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
       else:
         enqueue_ops.append(enqueue_ops_fn())
       infeed_queues.append(captured_infeed_queue.get())
@@ -1264,6 +1238,7 @@ class _ModelFnWrapper(object):
 
     host_call = _OutfeedHostCall(self._ctx)
     captured_scaffold_fn = _CapturedObject()
+    captured_training_hooks = _CapturedObject()
 
     def train_step(loss):
       """Training step function for use inside a while loop."""
@@ -1280,6 +1255,8 @@ class _ModelFnWrapper(object):
       else:
         captured_scaffold_fn.capture(None)
 
+      captured_training_hooks.capture(estimator_spec.training_hooks)
+
       # We must run train_op to update the variables prior to running the
       # outfeed.
       with ops.control_dependencies([train_op]):
@@ -1291,7 +1268,8 @@ class _ModelFnWrapper(object):
         with ops.control_dependencies(host_call_outfeed_ops):
           return array_ops.identity(loss)
 
-    return train_step, host_call, captured_scaffold_fn
+    return (train_step, host_call, captured_scaffold_fn,
+            captured_training_hooks)
 
   def convert_to_single_tpu_eval_step(self, dequeue_fn):
     """Converts user provided model_fn` as a single eval step on TPU.
@@ -1321,6 +1299,7 @@ class _ModelFnWrapper(object):
     """
     host_calls = _OutfeedHostCall(self._ctx)
     captured_scaffold_fn = _CapturedObject()
+    captured_eval_hooks = _CapturedObject()
 
     def eval_step(total_loss):
       """Evaluation step function for use inside a while loop."""
@@ -1335,6 +1314,8 @@ class _ModelFnWrapper(object):
 
       loss = tpu_estimator_spec.loss
       captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
+      captured_eval_hooks.capture(tpu_estimator_spec.evaluation_hooks)
+
       to_record = {}
       if tpu_estimator_spec.eval_metrics:
         to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics
@@ -1347,7 +1328,7 @@ class _ModelFnWrapper(object):
       with ops.control_dependencies(host_calls.create_enqueue_op()):
         return math_ops.add(total_loss, loss)
 
-    return eval_step, host_calls, captured_scaffold_fn
+    return eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
 
   def convert_to_single_tpu_predict_step(self, dequeue_fn):
     """Converts user provided model_fn` as a single predict step on TPU.
@@ -1362,6 +1343,7 @@ class _ModelFnWrapper(object):
     """
     host_calls = _OutfeedHostCall(self._ctx)
     captured_scaffold_fn = _CapturedObject()
+    captured_predict_hooks = _CapturedObject()
 
     def predict_step(unused_scalar_stopping_signal):
       """Evaluation step function for use inside a while loop."""
@@ -1382,6 +1364,7 @@ class _ModelFnWrapper(object):
       self._verify_tpu_spec_predictions(tpu_estimator_spec.predictions)
 
       captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
+      captured_predict_hooks.capture(tpu_estimator_spec.prediction_hooks)
       to_record = {}
       identity_fn = lambda **kwargs: kwargs
       to_record['predictions'] = [identity_fn, tpu_estimator_spec.predictions]
@@ -1393,7 +1376,8 @@ class _ModelFnWrapper(object):
       with ops.control_dependencies(host_calls.create_enqueue_op()):
         return _StopSignals.as_scalar_stopping_signal(stopping_signals)
 
-    return predict_step, host_calls, captured_scaffold_fn
+    return (predict_step, host_calls, captured_scaffold_fn,
+            captured_predict_hooks)
 
   def _verify_tpu_spec_predictions(self, predictions):
     """Validates TPUEstimatorSpec.predictions dict."""
@@ -1515,11 +1499,9 @@ class _ModelFnWrapper(object):
 
     err_msg = '{} returned by EstimatorSpec is not supported in TPUEstimator.'
     if estimator_spec.training_chief_hooks:
-      raise ValueError(err_msg.format('training_chief_hooks'))
-    if estimator_spec.training_hooks:
-      raise ValueError(err_msg.format('training_hooks'))
-    if estimator_spec.evaluation_hooks:
-      raise ValueError(err_msg.format('evaluation_hooks'))
+      raise ValueError(
+          err_msg.format('training_chief_hooks') + 'If you want' +
+          ' to pass training hooks, please pass via training_hooks.')
 
     if estimator_spec.scaffold:
       logging.warning('EstimatorSpec.Scaffold is ignored by TPU train/eval. '
@@ -1659,11 +1641,13 @@ class _OutfeedHostCall(object):
     # Outfeed ops execute on each replica's first logical core. Note: we must
     # constraint it such that we have at most one outfeed dequeue and enqueue
     # per replica.
-    tpu_device_placement_fn = self._ctx.tpu_device_placement_function
     for i in xrange(self._ctx.num_replicas):
-      with ops.device(tpu_device_placement_fn(i)):
+      host_device, ordinal_id = self._ctx.device_for_replica(i)
+      with ops.device(host_device):
         outfeed_tensors = tpu_ops.outfeed_dequeue_tuple(
-            dtypes=tensor_dtypes, shapes=tensor_shapes)
+            dtypes=tensor_dtypes,
+            shapes=tensor_shapes,
+            device_ordinal=ordinal_id)
         for j, item in enumerate(outfeed_tensors):
           dequeue_ops[j].append(item)
 
@@ -1787,6 +1771,9 @@ class InstallSignalHandlerHook(session_run_hook.SessionRunHook):
 class TPUEstimator(estimator_lib.Estimator):
   """Estimator with TPU support.
 
+  TPUEstimator also supports training on CPU and GPU. You don't need to define
+  a separate `tf.estimator.Estimator`.
+
   TPUEstimator handles many of the details of running on TPU devices, such as
   replicating inputs and models for each core, and returning to host
   periodically to run hooks.
@@ -1824,7 +1811,8 @@ class TPUEstimator(estimator_lib.Estimator):
   Current limitations:
   --------------------
 
-  1. TPU evaluation only works on a single host (one TPU worker).
+  1. TPU evaluation only works on a single host (one TPU worker) except
+     BROADCAST mode.
 
   2. `input_fn` for evaluation should **NOT** raise an end-of-input exception
      (`OutOfRangeError` or `StopIteration`). And all evaluation steps and all
@@ -1995,10 +1983,9 @@ class TPUEstimator(estimator_lib.Estimator):
     """Constructs an `TPUEstimator` instance.
 
     Args:
-      model_fn: Model function as required by `Estimator`. For training, the
-        returned `EstimatorSpec` cannot have hooks as it is not supported in
-        `TPUEstimator`. Instead, the user can pass the training hooks as
-        an argument to `TPUEstimator.train()`.
+      model_fn: Model function as required by `Estimator` which returns
+      EstimatorSpec or TPUEstimatorSpec. `training_hooks`, 'evaluation_hooks',
+      and `prediction_hooks` must not capure any TPU Tensor inside the model_fn.
       model_dir: Directory to save model parameters, graph and etc. This can
         also be used to load checkpoints from the directory into a estimator to
         continue training a previously saved model. If `None`, the model_dir in
@@ -2110,6 +2097,7 @@ class TPUEstimator(estimator_lib.Estimator):
     self._export_to_tpu = export_to_tpu
 
     self._is_input_fn_invoked = None
+    self._rendezvous = {}
 
   def _add_meta_graph_for_mode(self,
                                builder,
@@ -2353,6 +2341,65 @@ class TPUEstimator(estimator_lib.Estimator):
     """
     pass
 
+  def train(self,
+            input_fn,
+            hooks=None,
+            steps=None,
+            max_steps=None,
+            saving_listeners=None):
+    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
+    self._rendezvous[model_fn_lib.ModeKeys.TRAIN] = rendezvous
+    try:
+      return super(TPUEstimator, self).train(
+          input_fn=input_fn, hooks=hooks, steps=steps, max_steps=max_steps,
+          saving_listeners=saving_listeners
+      )
+    except Exception:  # pylint: disable=broad-except
+      rendezvous.record_error('training_loop', sys.exc_info())
+    finally:
+      rendezvous.record_done('training_loop')
+      rendezvous.raise_errors()
+
+  def evaluate(self, input_fn, steps=None, hooks=None, checkpoint_path=None,
+               name=None):
+    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
+    self._rendezvous[model_fn_lib.ModeKeys.EVAL] = rendezvous
+    try:
+      return super(TPUEstimator, self).evaluate(
+          input_fn, steps=steps, hooks=hooks, checkpoint_path=checkpoint_path,
+          name=name
+      )
+    except Exception:  # pylint: disable=broad-except
+      rendezvous.record_error('evaluation_loop', sys.exc_info())
+    finally:
+      rendezvous.record_done('evaluation_loop')
+      rendezvous.raise_errors()
+
+  def predict(self,
+              input_fn,
+              predict_keys=None,
+              hooks=None,
+              checkpoint_path=None,
+              yield_single_examples=True):
+    rendezvous = error_handling.ErrorRendezvous(num_sources=3)
+    self._rendezvous[model_fn_lib.ModeKeys.PREDICT] = rendezvous
+    try:
+      for result in super(TPUEstimator, self).predict(
+          input_fn=input_fn,
+          predict_keys=predict_keys,
+          hooks=hooks,
+          checkpoint_path=checkpoint_path,
+          yield_single_examples=yield_single_examples):
+        yield result
+    except Exception:  # pylint: disable=broad-except
+      rendezvous.record_error('prediction_loop', sys.exc_info())
+    finally:
+      rendezvous.record_done('prediction_loop')
+      rendezvous.raise_errors()
+
+    rendezvous.record_done('prediction_loop')
+    rendezvous.raise_errors()
+
   def _augment_model_fn(self, model_fn, batch_axis):
     """Returns a new model_fn, which wraps the TPU support."""
 
@@ -2407,7 +2454,7 @@ class TPUEstimator(estimator_lib.Estimator):
             graph.add_to_collection(_TPU_ENQUEUE_OPS, enqueue_op)
 
         if mode == model_fn_lib.ModeKeys.TRAIN:
-          loss, host_call, scaffold = (
+          loss, host_call, scaffold, training_hooks = (
               _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn))
           host_ops = host_call.create_tpu_hostcall()
           if host_ops is None:
@@ -2447,7 +2494,9 @@ class TPUEstimator(estimator_lib.Estimator):
                   enqueue_ops,
                   host_ops,
                   run_infeed_loop_on_coordinator=(
-                      run_infeed_loop_on_coordinator)),
+                      run_infeed_loop_on_coordinator),
+                  rendezvous=self._rendezvous[mode],
+              ),
               InstallSignalHandlerHook(),
               training.LoggingTensorHook(
                   {
@@ -2460,6 +2509,9 @@ class TPUEstimator(estimator_lib.Estimator):
               self._config.tpu_config.iterations_per_loop)
           hooks.append(examples_hook)
 
+          if training_hooks:
+            hooks.extend(training_hooks)
+
           chief_hooks = []
           if (self._config.save_checkpoints_secs or
               self._config.save_checkpoints_steps):
@@ -2471,6 +2523,7 @@ class TPUEstimator(estimator_lib.Estimator):
             checkpoint_hook._set_steps_per_run(   # pylint: disable=protected-access
                 self._config.tpu_config.iterations_per_loop)
             chief_hooks.append(checkpoint_hook)
+
           summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss)
           with ops.control_dependencies([loss]):
             update_ops = _sync_variables_ops()
@@ -2490,7 +2543,7 @@ class TPUEstimator(estimator_lib.Estimator):
               scaffold=scaffold)
 
         if mode == model_fn_lib.ModeKeys.EVAL:
-          total_loss, host_calls, scaffold = _eval_on_tpu_system(
+          total_loss, host_calls, scaffold, eval_hooks = _eval_on_tpu_system(
               ctx, model_fn_wrapper, dequeue_fn)
           iterations_per_loop_var = _create_or_get_iterations_per_loop()
           mean_loss = math_ops.div(total_loss,
@@ -2530,9 +2583,13 @@ class TPUEstimator(estimator_lib.Estimator):
                   enqueue_ops,
                   eval_update_ops + host_ops,
                   run_infeed_loop_on_coordinator=(
-                      run_infeed_loop_on_coordinator)),
+                      run_infeed_loop_on_coordinator),
+                  rendezvous=self._rendezvous[mode]),
           ] + input_hooks
 
+          if eval_hooks:
+            hooks.extend(eval_hooks)
+
           return model_fn_lib.EstimatorSpec(
               mode,
               loss=mean_loss,
@@ -2543,8 +2600,9 @@ class TPUEstimator(estimator_lib.Estimator):
         # Predict
         assert mode == model_fn_lib.ModeKeys.PREDICT
 
-        dummy_predict_op, host_calls, scaffold = _predict_on_tpu_system(
-            ctx, model_fn_wrapper, dequeue_fn)
+        (dummy_predict_op, host_calls,
+         scaffold, prediction_hooks) = _predict_on_tpu_system(
+             ctx, model_fn_wrapper, dequeue_fn)
         with ops.control_dependencies([dummy_predict_op]):
           internal_ops_to_run = _sync_variables_ops()
           with ops.control_dependencies(internal_ops_to_run):
@@ -2596,10 +2654,13 @@ class TPUEstimator(estimator_lib.Estimator):
 
         hooks = [
             _StoppingPredictHook(scalar_stopping_signal),
-            TPUInfeedOutfeedSessionHookForPrediction(ctx, enqueue_ops,
-                                                     host_ops),
+            TPUInfeedOutfeedSessionHookForPrediction(
+                ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode]),
         ] + input_hooks
 
+        if prediction_hooks:
+          hooks.extend(prediction_hooks)
+
         return model_fn_lib.EstimatorSpec(
             mode,
             prediction_hooks=hooks,
@@ -2683,8 +2744,8 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   """Executes `model_fn_wrapper` multiple times on all TPU shards."""
   iterations_per_loop_var = _create_or_get_iterations_per_loop()
 
-  single_tpu_eval_step, host_calls, captured_scaffold_fn = (
-      model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn))
+  (single_tpu_eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
+  ) = model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn)
 
   def multi_tpu_eval_steps_on_single_shard():
     return training_loop.repeat(
@@ -2699,15 +2760,16 @@ def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
       device_assignment=ctx.device_assignment)
 
   scaffold = _get_scaffold(captured_scaffold_fn)
-  return loss, host_calls, scaffold
+  return loss, host_calls, scaffold, captured_eval_hooks.get()
 
 
 def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   """Executes `model_fn_wrapper` multiple times on all TPU shards."""
   iterations_per_loop_var = _create_or_get_iterations_per_loop()
 
-  single_tpu_train_step, host_call, captured_scaffold_fn = (
-      model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn))
+  (single_tpu_train_step, host_call, captured_scaffold_fn,
+   captured_training_hooks) = (
+       model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn))
 
   def multi_tpu_train_steps_on_single_shard():
     return training_loop.repeat(
@@ -2722,15 +2784,16 @@ def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
       device_assignment=ctx.device_assignment)
 
   scaffold = _get_scaffold(captured_scaffold_fn)
-  return loss, host_call, scaffold
+  return loss, host_call, scaffold, captured_training_hooks.get()
 
 
 def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
   """Executes `model_fn_wrapper` multiple times on all TPU shards."""
   num_cores = ctx.num_cores
 
-  single_tpu_predict_step, host_calls, captured_scaffold_fn = (
-      model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn))
+  (single_tpu_predict_step, host_calls, captured_scaffold_fn,
+   captured_predict_hooks
+  ) = model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn)
 
   def multi_tpu_predict_steps_on_single_shard():
 
@@ -2750,7 +2813,7 @@ def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn):
       outputs_from_all_shards=False)
 
   scaffold = _get_scaffold(captured_scaffold_fn)
-  return dummy_predict_op, host_calls, scaffold
+  return dummy_predict_op, host_calls, scaffold, captured_predict_hooks.get()
 
 
 def _wrap_computation_in_while_loop(device, op_fn):
diff --git a/tensorflow/contrib/verbs/rdma_mgr.cc b/tensorflow/contrib/verbs/rdma_mgr.cc
index 9cb3d1fbbfdbc6d85a7a9799bd82438f0bf70c4f..3cb5e61facf860f2740935f66bf548096296280f 100644
--- a/tensorflow/contrib/verbs/rdma_mgr.cc
+++ b/tensorflow/contrib/verbs/rdma_mgr.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h"
 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
+#include "tensorflow/core/common_runtime/pool_allocator.h"
 #include "tensorflow/core/common_runtime/process_state.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
@@ -255,28 +256,25 @@ void MRDeleter(ibv_mr* mr) {
   }
 }
 
-// TODO(byronyi): remove this class duplicated from the one in
-// common/runtime/gpu/pool_allocator.h when it is available in common_runtime
-class BasicCPUAllocator : public SubAllocator {
- public:
-  ~BasicCPUAllocator() override {}
-
-  void* Alloc(size_t alignment, size_t num_bytes) override {
-    return port::AlignedMalloc(num_bytes, alignment);
-  }
-  void Free(void* ptr, size_t) override { port::AlignedFree(ptr); }
-};
-
 // TODO(byronyi): remove this class and its registration when the default
-// cpu_allocator() returns visitable allocator
+// cpu_allocator() returns visitable allocator, or cpu_allocator() is no
+// longer in use.
 class BFCRdmaAllocator : public BFCAllocator {
  public:
   BFCRdmaAllocator()
-      : BFCAllocator(new BasicCPUAllocator(), 1LL << 36, true, "cpu_rdma_bfc") {
+      : BFCAllocator(new BasicCPUAllocator(port::kNUMANoAffinity), 1LL << 36,
+                     true, "cpu_rdma_bfc") {}
+};
+class BFCRdmaAllocatorFactory : public AllocatorFactory {
+ public:
+  Allocator* CreateAllocator() { return new BFCRdmaAllocator; }
+
+  SubAllocator* CreateSubAllocator(int numa_node) {
+    return new BasicCPUAllocator(numa_node);
   }
 };
 
-REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocator);
+REGISTER_MEM_ALLOCATOR("BFCRdmaAllocator", 101, BFCRdmaAllocatorFactory);
 
 void RdmaMgr::InitAllocators() {
   RdmaMemoryMgr::Singleton().pd_ = rdma_adapter_->pd_;
@@ -284,8 +282,8 @@ void RdmaMgr::InitAllocators() {
   Allocator* allocators[] = {
 #if GOOGLE_CUDA
     GPUProcessState::singleton()->GetCUDAHostAllocator(0),
-    ProcessState::singleton()->GetCPUAllocator(0),
 #endif  // GOOGLE_CUDA
+    ProcessState::singleton()->GetCPUAllocator(0),
     cpu_allocator(),
   };
 
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 514713bb96735e40365b2daf50eee8906b0bed98..35a112e8340ccee1f27fb1cd44227a37bff5bacd 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -662,6 +662,7 @@ cc_library(
         "lib/random/random_distributions.h",
         "lib/random/simple_philox.h",
         "lib/strings/numbers.h",
+        "lib/strings/proto_serialization.h",
         "lib/strings/str_util.h",
         "lib/strings/strcat.h",
         "lib/strings/stringprintf.h",
@@ -883,6 +884,16 @@ cc_library(
     copts = tf_copts(),
 )
 
+tf_cc_test(
+    name = "stats_calculator_test",
+    srcs = ["util/stats_calculator_test.cc"],
+    deps = [
+        ":stats_calculator_portable",
+        ":test",
+        ":test_main",
+    ],
+)
+
 cc_library(
     name = "overflow",
     hdrs = ["util/overflow.h"],
@@ -1645,6 +1656,7 @@ cc_library(
     copts = tf_copts(android_optimization_level_override = None) + [
         "-DSUPPORT_SELECTIVE_REGISTRATION",
     ],
+    linkopts = if_android(["-lz"]),
     tags = [
         "manual",
         "notap",
@@ -1668,6 +1680,7 @@ cc_library(
     copts = tf_copts(android_optimization_level_override = None) + tf_opts_nortti_if_android() + [
         "-DSUPPORT_SELECTIVE_REGISTRATION",
     ],
+    linkopts = if_android(["-lz"]),
     tags = [
         "manual",
         "notap",
@@ -2911,6 +2924,14 @@ tf_cuda_library(
     ] + tf_additional_device_tracer_deps(),
 )
 
+cc_library(
+    name = "session_ref",
+    srcs = ["common_runtime/session_ref.cc"],
+    hdrs = ["common_runtime/session_ref.h"],
+    copts = tf_copts(),
+    deps = [":core_cpu_base"],
+)
+
 cc_library(
     name = "gpu_id",
     hdrs = [
@@ -3226,6 +3247,7 @@ tf_cc_tests(
         ":test",
         ":test_main",
         "//third_party/eigen3",
+        "@zlib_archive//:zlib",
     ],
 )
 
@@ -3735,7 +3757,6 @@ tf_cc_tests_gpu(
         "common_runtime/gpu/gpu_bfc_allocator_test.cc",
         "common_runtime/gpu/gpu_device_test.cc",
         "common_runtime/gpu/gpu_id_manager_test.cc",
-        "common_runtime/gpu/gpu_event_mgr_test.cc",
         "common_runtime/gpu/pool_allocator_test.cc",
     ],
     linkstatic = tf_kernel_tests_linkstatic(),
@@ -3759,6 +3780,23 @@ tf_cc_tests_gpu(
     ],
 )
 
+tf_cc_test_gpu(
+    name = "gpu_event_mgr_test",
+    srcs = ["common_runtime/gpu/gpu_event_mgr_test.cc"],
+    linkstatic = tf_kernel_tests_linkstatic(),
+    tags = tf_cuda_tests_tags(),
+    deps = [
+        ":framework",
+        ":framework_internal",
+        ":lib",
+        ":lib_internal",
+        ":protos_all_cc",
+        ":test",
+        ":test_main",
+        ":testlib",
+    ],
+)
+
 tf_cuda_cc_test(
     name = "gpu_device_unified_memory_test",
     size = "small",
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
index 3b3a274df5d51706a07b9165df0bb5bfda8de776..2b58969da255d173c143639bf017d9d156a60619 100644
--- a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
@@ -51,7 +51,7 @@ For example, say we want to update 4 scattered elements to a rank-1 tensor to
 8 elements. In Python, that update would look like this:
 
 ```python
-    ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+    ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True)
     indices = tf.constant([[4], [3], [1] ,[7]])
     updates = tf.constant([9, 10, 11, 12])
     update = tf.scatter_nd_add(ref, indices, updates)
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
index 58753a651a17d163da5e0e5affb4025ce9530bbd..ad1c527b010c6469358a96f9b28ec0fd6fd6896c 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
@@ -32,8 +32,12 @@ slices within a tensor (initially zero for numeric, empty for string) of
 the given `shape` according to indices.  This operator is the inverse of the
 @{tf.gather_nd} operator which extracts values or slices from a given tensor.
 
+If `indices` contains duplicates, then their updates are accumulated (summed).
+
 **WARNING**: The order in which updates are applied is nondeterministic, so the
-output will be nondeterministic if `indices` contains duplicates.
+output will be nondeterministic if `indices` contains duplicates -- because
+of some numerical approximation issues, numbers summed in different order
+may yield different results.
 
 `indices` is an integer tensor containing indices into a new tensor of shape
 `shape`.  The last dimension of `indices` can be at most the rank of `shape`:
diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc
index 9cda17867bafd6d33afdf020f0f56d00cd72328f..3bf0532491ad3ae6283282df40df1a080cd1f816 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.cc
+++ b/tensorflow/core/common_runtime/bfc_allocator.cc
@@ -155,10 +155,6 @@ bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
 
   region_manager_.set_handle(c->ptr, h);
 
-  // TODO(vrv): Try to merge this new region with an existing region,
-  // if the address space is contiguous, to avoid fragmentation
-  // across regions.
-
   // Insert the chunk into the right bin.
   InsertFreeChunkIntoBin(h);
 
@@ -465,49 +461,33 @@ void BFCAllocator::FreeAndMaybeCoalesce(BFCAllocator::ChunkHandle h) {
   Chunk* c = ChunkFromHandle(h);
   CHECK(c->in_use() && (c->bin_num == kInvalidBinNum));
 
-  // Mark the chunk as no longer in use
+  // Mark the chunk as no longer in use.
   c->allocation_id = -1;
 
   // Updates the stats.
   stats_.bytes_in_use -= c->size;
 
-  // This chunk is no longer in-use, consider coalescing the chunk
-  // with adjacent chunks.
-  ChunkHandle chunk_to_reassign = h;
-
-  // If the next chunk is free, coalesce the two
-  if (c->next != kInvalidChunkHandle) {
-    Chunk* cnext = ChunkFromHandle(c->next);
-    if (!cnext->in_use()) {
-      //      VLOG(8) << "Chunk at " << cnext->ptr << " merging with c " <<
-      //      c->ptr;
-
-      chunk_to_reassign = h;
+  ChunkHandle coalesced_chunk = h;
 
-      // Deletes c->next
-      RemoveFreeChunkFromBin(c->next);
-      Merge(h, ChunkFromHandle(h)->next);
-    }
+  // If the next chunk is free, merge it into c and delete it.
+  if (c->next != kInvalidChunkHandle && !ChunkFromHandle(c->next)->in_use()) {
+    // VLOG(8) << "Merging c->next " << ChunkFromHandle(c->next)->ptr
+    //         << " with c " << c->ptr;
+    RemoveFreeChunkFromBin(c->next);
+    Merge(h, c->next);
   }
 
-  // If the previous chunk is free, coalesce the two
-  c = ChunkFromHandle(h);
-  if (c->prev != kInvalidChunkHandle) {
-    Chunk* cprev = ChunkFromHandle(c->prev);
-    if (!cprev->in_use()) {
-      //      VLOG(8) << "Chunk at " << c->ptr << " merging into c->prev "
-      //       << cprev->ptr;
-
-      chunk_to_reassign = c->prev;
+  // If the previous chunk is free, merge c into it and delete c.
+  if (c->prev != kInvalidChunkHandle && !ChunkFromHandle(c->prev)->in_use()) {
+    // VLOG(8) << "Merging c " << c->ptr << " into c->prev "
+    //         << ChunkFromHandle(c->prev)->ptr;
 
-      // Deletes c
-      RemoveFreeChunkFromBin(c->prev);
-      Merge(ChunkFromHandle(h)->prev, h);
-      c = ChunkFromHandle(h);
-    }
+    coalesced_chunk = c->prev;
+    RemoveFreeChunkFromBin(c->prev);
+    Merge(c->prev, h);
   }
 
-  InsertFreeChunkIntoBin(chunk_to_reassign);
+  InsertFreeChunkIntoBin(coalesced_chunk);
 }
 
 void BFCAllocator::AddAllocVisitor(Visitor visitor) {
diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h
index 52aedb1e9c2865418bd93af8d75053d1c2280b2e..580e61e2ea98daf868737eb0aa976918b390e3dd 100644
--- a/tensorflow/core/common_runtime/bfc_allocator.h
+++ b/tensorflow/core/common_runtime/bfc_allocator.h
@@ -88,11 +88,20 @@ class BFCAllocator : public VisitableAllocator {
   static const int kInvalidBinNum = -1;
   static const int kNumBins = 21;
 
-  // Chunks point to memory.  Their prev/next pointers form a
-  // doubly-linked list of addresses sorted by base address that
-  // must be contiguous.  Chunks contain information about whether
-  // they are in use or whether they are free, and contain a pointer
-  // to the bin they are in.
+  // A Chunk points to a piece of memory that's either entirely free or entirely
+  // in use by one user memory allocation.
+  //
+  // An AllocationRegion's memory is split up into one or more disjoint Chunks,
+  // which together cover the whole region without gaps.  Chunks participate in
+  // a doubly-linked list, and the prev/next pointers point to the physically
+  // adjacent chunks.
+  //
+  // Since a chunk cannot be partially in use, we may need to split a free chunk
+  // in order to service a user allocation.  We always merge adjacent free
+  // chunks.
+  //
+  // Chunks contain information about whether they are in use or whether they
+  // are free, and contain a pointer to the bin they are in.
   struct Chunk {
     size_t size = 0;  // Full size of buffer.
 
@@ -177,8 +186,12 @@ class BFCAllocator : public VisitableAllocator {
   static const size_t kMinAllocationBits = 8;
   static const size_t kMinAllocationSize = 1 << kMinAllocationBits;
 
-  // AllocationRegion maps pointers to ChunkHandles for a single
-  // contiguous memory region.
+  // BFCAllocator allocates memory into a collection of disjoint
+  // AllocationRegions.  Each AllocationRegion corresponds to one call to
+  // SubAllocator::Alloc().
+  //
+  // An AllocationRegion contains one or more Chunks, covering all of its
+  // memory.  Its primary job is to map a pointers to ChunkHandles.
   //
   // This class is thread-compatible.
   class AllocationRegion {
@@ -191,18 +204,14 @@ class BFCAllocator : public VisitableAllocator {
       DCHECK_EQ(0, memory_size % kMinAllocationSize);
       const size_t n_handles =
           (memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
-      handles_ = new ChunkHandle[n_handles];
+      handles_.reset(new ChunkHandle[n_handles]);
       for (size_t i = 0; i < n_handles; i++) {
         handles_[i] = kInvalidChunkHandle;
       }
     }
 
-    AllocationRegion() {}
-
-    ~AllocationRegion() { delete[] handles_; }
-
+    AllocationRegion() = default;
     AllocationRegion(AllocationRegion&& other) { Swap(other); }
-
     AllocationRegion& operator=(AllocationRegion&& other) {
       Swap(other);
       return *this;
@@ -241,7 +250,7 @@ class BFCAllocator : public VisitableAllocator {
     // Array of size "memory_size / kMinAllocationSize".  It is
     // indexed by (p-base) / kMinAllocationSize, contains ChunkHandle
     // for the memory allocation represented by "p"
-    ChunkHandle* handles_ = nullptr;
+    std::unique_ptr<ChunkHandle[]> handles_;
 
     TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
   };
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 4c670820be1d513e4304d649cef076f44686feb4..d1fd930d25f1e7f6700373419ab75c0e8a6c7094 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -235,7 +235,11 @@ void DirectSession::SchedClosure(thread::ThreadPool* pool,
   // safe given the reasoning above.
   c();
 #else
-  pool->Schedule(std::move(c));
+  if (pool != nullptr) {
+    pool->Schedule(std::move(c));
+  } else {
+    c();
+  }
 #endif  // __ANDROID__
 }
 
@@ -522,8 +526,9 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
     }
   }
 
-  if (run_options.inter_op_thread_pool() < 0 ||
-      run_options.inter_op_thread_pool() >= thread_pools_.size()) {
+  if (run_options.inter_op_thread_pool() < -1 ||
+      run_options.inter_op_thread_pool() >=
+          static_cast<int32>(thread_pools_.size())) {
     run_state.executors_done.Notify();
     delete barrier;
     return errors::InvalidArgument("Invalid inter_op_thread_pool: ",
@@ -548,7 +553,19 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
   }
 
   thread::ThreadPool* pool =
-      thread_pools_[run_options.inter_op_thread_pool()].first;
+      run_options.inter_op_thread_pool() >= 0
+          ? thread_pools_[run_options.inter_op_thread_pool()].first
+          : nullptr;
+
+  if (pool == nullptr) {
+    // We allow using the caller thread only when having a single executor
+    // specified.
+    if (executors_and_keys->items.size() > 1) {
+      pool = thread_pools_[0].first;
+    } else {
+      VLOG(1) << "Executing Session::Run() synchronously!";
+    }
+  }
 
   Executor::Args::Runner default_runner = [this,
                                            pool](Executor::Args::Closure c) {
@@ -700,7 +717,8 @@ Status DirectSession::Run(const RunOptions& run_options,
   // Receive outputs.
   if (outputs) {
     std::vector<Tensor> sorted_outputs;
-    const Status s = call_frame.ConsumeRetvals(&sorted_outputs);
+    const Status s = call_frame.ConsumeRetvals(
+        &sorted_outputs, /* allow_dead_tensors = */ false);
     if (errors::IsInternal(s)) {
       return errors::InvalidArgument(s.error_message());
     } else if (!s.ok()) {
diff --git a/tensorflow/core/common_runtime/direct_session_test.cc b/tensorflow/core/common_runtime/direct_session_test.cc
index 142d6131293478543222c593f5f47b641ba39bcc..4b51b20bb198b8266f20ffef238c055e510cf737 100644
--- a/tensorflow/core/common_runtime/direct_session_test.cc
+++ b/tensorflow/core/common_runtime/direct_session_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <map>
 #include <memory>
 #include <string>
+#include <thread>
 #include <unordered_map>
 #include <vector>
 
@@ -896,6 +897,125 @@ TEST(DirectSessionTest, FetchMultipleTimes) {
   }
 }
 
+TEST(DirectSessionTest, MultipleFeedTestSomeSyncRun) {
+  GraphDef def;
+  Graph g(OpRegistry::Global());
+  RunOptions run_options;
+  run_options.set_inter_op_thread_pool(-1);
+
+  Tensor first_value(DT_FLOAT, TensorShape({}));
+  first_value.scalar<float>()() = 1.0;
+  Node* first_const = test::graph::Constant(&g, first_value);
+  Node* first_identity = test::graph::Identity(&g, first_const);
+
+  Tensor second_value(DT_FLOAT, TensorShape({}));
+  second_value.scalar<float>()() = 2.0;
+  Node* second_const = test::graph::Constant(&g, second_value);
+  Node* second_identity = test::graph::Identity(&g, second_const);
+
+  test::graph::ToGraphDef(&g, &def);
+
+  auto session = CreateSession();
+  ASSERT_TRUE(session != nullptr);
+  TF_ASSERT_OK(session->Create(def));
+
+  std::vector<Tensor> outputs;
+
+  // Fetch without feeding.
+  Status s = session->Run(
+      run_options, {},
+      {first_identity->name() + ":0", second_identity->name() + ":0"}, {},
+      &outputs, nullptr);
+  TF_ASSERT_OK(s);
+  ASSERT_EQ(2, outputs.size());
+  ASSERT_EQ(1.0, outputs[0].flat<float>()(0));
+  ASSERT_EQ(2.0, outputs[1].flat<float>()(0));
+
+  s = session->Run(
+      {}, {second_identity->name() + ":0", first_identity->name() + ":0"}, {},
+      &outputs);
+  TF_ASSERT_OK(s);
+  ASSERT_EQ(2, outputs.size());
+  ASSERT_EQ(2.0, outputs[0].flat<float>()(0));
+  ASSERT_EQ(1.0, outputs[1].flat<float>()(0));
+
+  Tensor value_11(DT_FLOAT, TensorShape({}));
+  value_11.scalar<float>()() = 11.0;
+  Tensor value_22(DT_FLOAT, TensorShape({}));
+  value_22.scalar<float>()() = 22.0;
+
+  // Feed [first_const, second_const]
+  s = session->Run(
+      {{first_const->name(), value_11}, {second_const->name(), value_22}},
+      {first_identity->name() + ":0", second_identity->name() + ":0"}, {},
+      &outputs);
+  TF_ASSERT_OK(s);
+  ASSERT_EQ(2, outputs.size());
+  ASSERT_EQ(11.0, outputs[0].flat<float>()(0));
+  ASSERT_EQ(22.0, outputs[1].flat<float>()(0));
+
+  // Feed [second_const, first_const]
+  s = session->Run(
+      {{second_const->name(), value_22}, {first_const->name(), value_11}},
+      {first_identity->name() + ":0", second_identity->name() + ":0"}, {},
+      &outputs);
+  TF_ASSERT_OK(s);
+  ASSERT_EQ(2, outputs.size());
+  ASSERT_EQ(11.0, outputs[0].flat<float>()(0));
+  ASSERT_EQ(22.0, outputs[1].flat<float>()(0));
+
+  // Feed [first_const, first_const]
+  s = session->Run(
+      run_options,
+      {{first_const->name(), value_11}, {first_const->name(), value_22}},
+      {first_identity->name() + ":0", second_identity->name() + ":0"}, {},
+      &outputs, nullptr);
+  EXPECT_TRUE(errors::IsInvalidArgument(s));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(), "fed more than once"));
+}
+
+REGISTER_OP("ThreadID").Input("x: int64").Output("y: int64").Doc(R"doc(
+ThreadID returns the thread ID that called compute.
+
+x: int64
+y: int64
+)doc");
+
+// The ThreadID kernel returns the thread ID that executed Compute.
+class ThreadIDOp : public OpKernel {
+ public:
+  explicit ThreadIDOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) override {
+    Tensor* out_tensor = nullptr;
+    OP_REQUIRES_OK(ctx,
+                   ctx->allocate_output("y", TensorShape({}), &out_tensor));
+    std::hash<std::thread::id> hasher;
+    out_tensor->scalar<int64>()() =
+        static_cast<int64>(hasher(std::this_thread::get_id()));
+  }
+};
+REGISTER_KERNEL_BUILDER(Name("ThreadID").Device(DEVICE_CPU), ThreadIDOp);
+
+TEST(DirectSessionTest, SessionSyncRun) {
+  Graph g(OpRegistry::Global());
+  Tensor vx(DT_INT64, TensorShape({}));
+  vx.scalar<int64>()() = 17;
+  Node* x = test::graph::Constant(&g, vx);
+  Node* y = test::graph::Unary(&g, "ThreadID", x);
+  GraphDef def;
+  test::graph::ToGraphDef(&g, &def);
+  auto sess = CreateSession();
+  TF_ASSERT_OK(sess->Create(def));
+  std::vector<Tensor> outputs;
+  RunOptions run_opts;
+  run_opts.set_inter_op_thread_pool(-1);
+  auto s = sess->Run(run_opts, {}, {y->name() + ":0"}, {}, &outputs, nullptr);
+
+  std::hash<std::thread::id> hasher;
+  EXPECT_EQ(static_cast<int64>(hasher(std::this_thread::get_id())),
+            static_cast<int64>(outputs[0].scalar<int64>()()));
+}
+
 REGISTER_OP("Darth").Input("x: float").Output("y: float").Doc(R"doc(
 Darth promises one return value.
 
@@ -1400,6 +1520,7 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib,
   p = options.config.add_session_inter_op_thread_pool();
   if (use_global_pools) p->set_global_name("small pool");
   p->set_num_threads(1);
+  const int kSyncPool = -1;
   const int kLargePool = 0;
   const int kSmallPool = 1;
 
@@ -1442,7 +1563,11 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib,
           EXPECT_FLOAT_EQ(1.2, flat(0));
           num_done.fetch_add(1);
         };
-        tp->Schedule(fn);
+        if (tp != nullptr) {
+          tp->Schedule(fn);
+        } else {
+          fn();
+        }
       };
 
   // For blocking states:
@@ -1463,9 +1588,10 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib,
 
   tp1 = new thread::ThreadPool(Env::Default(), "tp1", 5);
 
-  // Launch 2 session run calls. Neither will finish until the blocking op is
+  // Launch a session run call. It will not finish until the blocking op is
   // unblocked, because it is using all threads in the small pool.
   add_session_run_call(tp1, y, kSmallPool);
+
   blocking_op_state->AwaitState(1);  // Wait for the blocking op to Compute.
 
   // These will block on <BlockingOpState>.
@@ -1484,10 +1610,15 @@ static void TestSessionInterOpThreadsImpl(bool use_function_lib,
   delete tp2;
   EXPECT_EQ(kUnblockedThreads, num_done.load());
 
+  // Launch a session call using this thread. This will finish as it runs
+  // synchronously in this thread.
+  add_session_run_call(nullptr, x, kSyncPool);
+
   // Unblock the blocked op and wait for the blocked functions to finish.
   blocking_op_state->MoveToState(1, 2);
   delete tp1;
-  EXPECT_EQ(kUnblockedThreads + kBlockedThreads + 1, num_done.load());
+
+  EXPECT_EQ(kUnblockedThreads + kBlockedThreads + 1 + 1, num_done.load());
   delete blocking_op_state;
   blocking_op_state = nullptr;
 }
@@ -1532,7 +1663,7 @@ TEST(DirectSessionTest, TestSessionInterOpThreadsInvalidOptions) {
   {
     std::unique_ptr<Session> session(NewSession(options));
     TF_ASSERT_OK(session->Create(def));
-    for (int pool_num = -1; pool_num <= 1; pool_num += 2) {
+    for (int pool_num = -2; pool_num <= 1; pool_num += 3) {
       RunOptions run_options;
       run_options.set_inter_op_thread_pool(pool_num);
       std::vector<Tensor> outputs;
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index 1c5e9a2a31e7313a91b1e9c72345291fd8ad9db4..5e0f0a45f83cbbe515a5d13aa4337773ba96cd7f 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -17,8 +17,20 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/util/env_var.h"
 
 namespace tensorflow {
+namespace {
+
+bool ReadBoolFromEnvVar(StringPiece env_var_name, bool default_val) {
+  bool val;
+  if (ReadBoolFromEnvVar(env_var_name, default_val, &val).ok()) {
+    return val;
+  }
+  return default_val;
+}
+
+}  // namespace
 
 EagerContext::EagerContext(const SessionOptions& opts,
                            ContextDevicePlacementPolicy default_policy,
@@ -34,7 +46,8 @@ EagerContext::EagerContext(const SessionOptions& opts,
           local_device_manager_.get(), opts.env, TF_GRAPH_DEF_VERSION,
           &func_lib_def_, {}, thread_pool_.get())),
       log_device_placement_(opts.config.log_device_placement()),
-      async_default_(async) {
+      async_default_(async),
+      use_send_tensor_rpc_(false) {
   InitDeviceMapAndAsync();
   if (opts.config.inter_op_parallelism_threads() > 0) {
     runner_ = [this](std::function<void()> closure) {
@@ -66,7 +79,9 @@ EagerContext::EagerContext(
       remote_device_manager_(std::move(remote_device_manager)),
       server_(std::move(server)),
       remote_eager_workers_(std::move(remote_eager_workers)),
-      remote_contexts_(remote_contexts) {
+      remote_contexts_(remote_contexts),
+      use_send_tensor_rpc_(
+          ReadBoolFromEnvVar("TF_EAGER_REMOTE_USE_SEND_TENSOR_RPC", false)) {
   InitDeviceMapAndAsync();
 }
 #endif
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index d0563280bf163ae0b28354bbeddcc4f5335026a6..4a180e074d0402e683415fd2507a832772f24958 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -182,6 +182,11 @@ class EagerContext {
 #ifndef __ANDROID__
   Status GetClientAndContextID(Device* device, eager::EagerClient** client,
                                uint64* context_id);
+
+  // If true, then tensors should be shipped across processes via the
+  // EagerService.SendTensor RPC. If false, _Send/_Recv ops should be used
+  // instead (which in-turn use WorkerService.RecvTensor RPCs.
+  bool UseSendTensorRPC() { return use_send_tensor_rpc_; }
 #endif
  private:
   void InitDeviceMapAndAsync();
@@ -239,16 +244,18 @@ class EagerContext {
 
   const std::unique_ptr<DeviceMgr> remote_device_manager_;
 
+#ifndef __ANDROID__
   // The server_ is not const since we release it when the context is destroyed.
   // Therefore the server_ object is not marked as const (even though it should
   // be).
-#ifndef __ANDROID__
   std::unique_ptr<ServerInterface> server_;
   const std::unique_ptr<eager::EagerClientCache> remote_eager_workers_;
 
   const gtl::FlatMap<string, uint64> remote_contexts_;
   gtl::FlatMap<Device*, std::pair<eager::EagerClient*, uint64>>
       device_to_client_cache_;
+
+  const bool use_send_tensor_rpc_;
 #endif
 };
 
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 5ea814ed4efea6143cd7ee89dbe50e68469b1f36..f97fa4fadc76436f2dfe7ca24b8f0189ee376b20 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -88,6 +88,8 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
   TF_RETURN_IF_ERROR((*handle)->Device(&handle_device));
   const Device* actual_device =
       handle_device == nullptr ? ctx->HostCPU() : handle_device;
+  const Device* op_device =
+      op->Device() == nullptr ? ctx->HostCPU() : op->Device();
 
   if (expected_device != actual_device) {
     switch (ctx->GetDevicePlacementPolicy()) {
@@ -106,8 +108,8 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
             " cannot compute ",
             op->Name(), " as input #", i, " was expected to be on ",
             expected_device->name(), " but is actually on ",
-            actual_device->name(), " (operation running on ",
-            op->Device()->name(), ")",
+            actual_device->name(), " (operation running on ", op_device->name(),
+            ")",
             " Tensors can be copied explicitly using .gpu() or .cpu() "
             "methods,"
             " or transparently copied by using tf.enable_eager_execution("
@@ -118,7 +120,7 @@ Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
         LOG(WARNING) << "before computing " << op->Name() << " input #" << i
                      << " was expected to be on " << expected_device->name()
                      << " but is actually on " << actual_device->name()
-                     << " (operation running on " << op->Device()->name()
+                     << " (operation running on " << op_device->name()
                      << "). This triggers a copy which can be a performance "
                         "bottleneck.";
         break;
@@ -173,7 +175,7 @@ Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
     tensorflow::TensorHandle* handle = op->Inputs()[i];
     if (handle->dtype != kernel->input_type(i)) {
       return errors::InvalidArgument(
-          "cannot compute ", op->Name(), " as input #", i,
+          "cannot compute ", op->Name(), " as input #", i, "(zero-based)",
           " was expected to be a ", DataTypeString(kernel->input_type(i)),
           " tensor but is a ", DataTypeString(handle->dtype), " tensor");
     }
@@ -446,6 +448,14 @@ bool IsLocal(EagerContext* ctx, tensorflow::Device* d) {
   return ctx->local_device_mgr()->LookupDevice(d->name(), &tmp).ok();
 }
 
+bool OnSameTask(EagerContext* ctx, Device* first, Device* second) {
+  if (first == nullptr) first = ctx->HostCPU();
+  if (second == nullptr) second = ctx->HostCPU();
+  return first->parsed_name().job == second->parsed_name().job &&
+         first->parsed_name().replica == second->parsed_name().replica &&
+         first->parsed_name().task == second->parsed_name().task;
+}
+
 Status EagerLocalExecute(EagerOperation* op,
                          gtl::InlinedVector<TensorHandle*, 2>* retvals,
                          int* num_retvals) {
@@ -583,6 +593,87 @@ Status EagerLocalExecute(EagerOperation* op,
   return status;
 }
 
+std::function<void()> GetRemoteTensorDestructor(
+    EagerContext* ctx, eager::EagerClient* eager_client, uint64 context_id,
+    uint64 op_id, int output_num) {
+  return [ctx, eager_client, context_id, op_id, output_num]() {
+    std::unique_ptr<eager::EnqueueRequest> request(new eager::EnqueueRequest);
+    request->set_context_id(context_id);
+
+    auto* handle_to_decref = request->add_queue()->mutable_handle_to_decref();
+    handle_to_decref->set_op_id(op_id);
+    handle_to_decref->set_output_num(output_num);
+
+    if (ctx->Async()) {
+      tensorflow::uint64 id = ctx->NextId();
+      auto* node =
+          new eager::RemoteExecuteNode(id, std::move(request), eager_client);
+      ctx->ExecutorAdd(node);
+    } else {
+      eager::EnqueueRequest* actual_request = request.release();
+      eager::EnqueueResponse* response = new eager::EnqueueResponse;
+      eager_client->EnqueueAsync(
+          actual_request, response,
+          [actual_request, response](const tensorflow::Status& s) {
+            delete actual_request;
+            delete response;
+          });
+    }
+
+    return tensorflow::Status::OK();
+  };
+}
+
+// When !ctx->UseSendTensorRPC(), then tensors are shipped between remote
+// devices by the receiver invoking the WorkerService.RecvTensor RPC *on the
+// sender* (Rendezvous::RecvAsync() invoked by the _Recv kernel).
+//
+// However, in some configurations the node that has the tensor to be copied
+// isn't running a server (WorkerService RPC interface). For such cases,
+// this function enables sending tensors using the EagerService.SendTensor RPC
+// *on the receiver*.
+Status EagerRemoteSendTensor(EagerContext* ctx, TensorHandle* h,
+                             Device* recv_device, TensorHandle** result) {
+  eager::EagerClient* eager_client;
+  uint64 context_id;
+  TF_RETURN_IF_ERROR(
+      ctx->GetClientAndContextID(recv_device, &eager_client, &context_id));
+
+  eager::SendTensorRequest request;
+  eager::SendTensorResponse response;
+
+  request.set_context_id(context_id);
+  request.set_op_id(ctx->NextId());
+  request.set_device_name(recv_device->name());
+
+  const Tensor* tensor;
+  TF_RETURN_IF_ERROR(h->Tensor(&tensor));
+  tensor->AsProtoTensorContent(request.add_tensors());
+
+  const tensorflow::uint64 id = request.op_id();
+
+  // TODO(nareshmodi): support making this call async.
+  Notification n;
+  Status status;
+  eager_client->SendTensorAsync(&request, &response,
+                                [&n, &status](const Status& s) {
+                                  status = s;
+                                  n.Notify();
+                                });
+  n.WaitForNotification();
+  if (!status.ok()) return status;
+
+  std::function<void()> destructor =
+      GetRemoteTensorDestructor(ctx, eager_client, context_id, id, 0);
+
+  *result = new TensorHandle(id, /*output_num=*/0, /*remote_shape_node_id=*/0,
+                             tensor->dtype(), std::move(destructor),
+                             recv_device, recv_device, ctx);
+  (*result)->SetRemoteShape(MakeUnique<TensorShape>(tensor->shape()));
+
+  return Status::OK();
+}
+
 Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
                           int* num_retvals) {
 #ifdef __ANDROID__
@@ -596,15 +687,21 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
   TF_RETURN_IF_ERROR(
       ctx->GetClientAndContextID(op->Device(), &eager_client, &context_id));
 
-  eager::EnqueueRequest request;
+  std::unique_ptr<eager::EnqueueRequest> request(new eager::EnqueueRequest);
   eager::EnqueueResponse response;
 
-  auto* remote_op = request.add_queue()->mutable_operation();
+  request->set_context_id(context_id);
+
+  auto* remote_op = request->add_queue()->mutable_operation();
 
   for (int i = 0; i < op->Inputs().size(); i++) {
     tensorflow::Device* input_device;
     TF_RETURN_IF_ERROR(op->Inputs()[i]->Device(&input_device));
-    if (op->Device() != input_device) {
+    if (op->Device() != input_device &&
+        // If the expected and actual devices are on the same task, don't
+        // explicitly copy, and instead depend on the copy to happen locally
+        // when the op is executed on the device.
+        !OnSameTask(ctx, op->Device(), input_device)) {
       // TODO(b/110044833): It's possible the same tensor gets copied to the
       // remote device repeatedly.
       TF_RETURN_IF_ERROR(MaybeCopyInputToExpectedDevice(
@@ -629,8 +726,6 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
   op->Attrs().FillAttrValueMap(remote_op->mutable_attrs());
   remote_op->set_device(op->Device()->name());
 
-  request.set_context_id(context_id);
-
   DataTypeVector output_dtypes;
   TF_RETURN_IF_ERROR(GetOutputDTypes(op, &output_dtypes));
 
@@ -652,32 +747,11 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
   for (int i = 0; i < *num_retvals; i++) {
     // TODO(nareshmodi): Change the callback to instead add the decref to a list
     // of pending decrefs that we can send as a batch with the next execute.
-    std::function<void()> callback = [ctx, eager_client, context_id, id, i]() {
-      eager::EnqueueRequest request;
-      request.set_context_id(context_id);
-
-      auto* handle_to_decref = request.add_queue()->mutable_handle_to_decref();
-      handle_to_decref->set_op_id(id);
-      handle_to_decref->set_output_num(i);
-
-      if (ctx->Async()) {
-        tensorflow::uint64 id = ctx->NextId();
-        auto* node = new eager::RemoteExecuteNode(id, request, eager_client);
-        ctx->ExecutorAdd(node);
-      } else {
-        Notification n;
-        eager::EnqueueResponse response;
-        eager_client->EnqueueAsync(
-            &request, &response,
-            [&n](const tensorflow::Status& s) { n.Notify(); });
-        n.WaitForNotification();
-      }
-
-      return tensorflow::Status::OK();
-    };
+    std::function<void()> destructor =
+        GetRemoteTensorDestructor(ctx, eager_client, context_id, id, i);
 
     retvals[i] = new TensorHandle(remote_op->id(), i, remote_node_id,
-                                  output_dtypes[i], std::move(callback),
+                                  output_dtypes[i], std::move(destructor),
                                   op_device, op_device, op->EagerContext());
   }
 
@@ -691,7 +765,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
     }
     // Unable to capture via std::move, so bind instead.
     auto* node = new eager::RemoteExecuteNode(
-        remote_node_id, request, eager_client, op->Inputs(),
+        remote_node_id, std::move(request), eager_client, op->Inputs(),
         std::bind(
             [](const gtl::InlinedVector<TensorHandle*, 2>& retvals,
                const Status& status, const eager::EnqueueResponse& response) {
@@ -708,7 +782,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals,
   } else {
     Notification n;
     Status status;
-    eager_client->EnqueueAsync(&request, &response,
+    eager_client->EnqueueAsync(request.get(), &response,
                                [&n, &status](const Status& s) {
                                  status = s;
                                  n.Notify();
@@ -937,6 +1011,8 @@ Status EagerCopyToDevice(TensorHandle* h, EagerContext* ctx,
 
   if (sender_is_local && recver_is_local) {
     return LocalEagerCopyToDevice(h, ctx, recv_device, result);
+  } else if (ctx->UseSendTensorRPC() && sender_is_local && !recver_is_local) {
+    return EagerRemoteSendTensor(ctx, h, recv_device, result);
   } else {
     string wire_id = GetUniqueWireID();
 
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index a93cfa2ec54a379cf7da6347c49ac5e1a060268e..54bbe84b57bc0f574fa0566da6d2238b0f7e7082 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -746,6 +746,8 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
     rets_alloc_attrs.push_back(ret_alloc_attrs);
   }
 
+  bool allow_dead_tensors = opts.allow_dead_tensors;
+
   // The ProcFLR sends the arguments to the function from the source_device to
   // the target_device. So here we receive those arguments. Similarly, when the
   // computation is done and stored in *rets, we send the return values back
@@ -756,7 +758,7 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
       device_context, args_alloc_attrs, rendezvous, remote_args,
       [frame, remote_args, item, source_device, target_device,
        target_incarnation, rendezvous, device_context, rets, done, exec_args,
-       rets_alloc_attrs](const Status& status) {
+       rets_alloc_attrs, allow_dead_tensors](const Status& status) {
         Status s = status;
         if (s.ok()) {
           s = frame->SetArgs(*remote_args);
@@ -769,13 +771,13 @@ void FunctionLibraryRuntimeImpl::RunRemote(const Options& opts, Handle handle,
           return;
         }
         item->exec->RunAsync(
-            *exec_args,
-            [frame, rets, done, source_device, target_device,
-             target_incarnation, rendezvous, device_context, remote_args,
-             exec_args, rets_alloc_attrs](const Status& status) {
+            *exec_args, [frame, rets, done, source_device, target_device,
+                         target_incarnation, rendezvous, device_context,
+                         remote_args, exec_args, rets_alloc_attrs,
+                         allow_dead_tensors](const Status& status) {
               Status s = status;
               if (s.ok()) {
-                s = frame->ConsumeRetvals(rets);
+                s = frame->ConsumeRetvals(rets, allow_dead_tensors);
               }
               delete frame;
               if (!s.ok()) {
@@ -859,14 +861,15 @@ void FunctionLibraryRuntimeImpl::Run(const Options& opts, Handle handle,
     return;
   }
 
+  bool allow_dead_tensors = opts.allow_dead_tensors;
   item->exec->RunAsync(
       // Executor args
       *exec_args,
       // Done callback.
-      [frame, rets, done, exec_args](const Status& status) {
+      [frame, rets, done, exec_args, allow_dead_tensors](const Status& status) {
         Status s = status;
         if (s.ok()) {
-          s = frame->ConsumeRetvals(rets);
+          s = frame->ConsumeRetvals(rets, allow_dead_tensors);
         }
         delete frame;
         delete exec_args;
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index 1e837e9a7e7f7bf5c394a2725b47f4053ed911a5..120f480198a9f0313be437b5ca607b440eb9b883 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -1019,8 +1019,9 @@ TEST_F(FunctionLibraryRuntimeTest, Error_BadControlFlow) {
   DCHECK_EQ(x.dtype(), DT_INT32);
   Tensor y;
   HasError(InstantiateAndRun(flr0_, "InvalidControlFlow", {}, {x}, {&y}),
-           "The node 'add' has inputs from different frames. The input 'enter' "
-           "is in frame 'while'. The input 'i' is in frame ''.");
+           "{{node add}} has inputs from different frames. The input"
+           " {{node enter}} is in frame 'while'. The input {{node i}} is in"
+           " frame ''.");
 }
 
 TEST_F(FunctionLibraryRuntimeTest, Gradient_XTimesTwo) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
index 48984484760f86ce2edd9d84d85776e503a019b5..3c1c31aa732d373e76599cdc8fe8ae8561765c9c 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.cc
@@ -15,11 +15,80 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
 
+#include "tensorflow/core/platform/stacktrace.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 
+namespace {
+// The EventMgr has 1 thread for the polling loop and one to execute
+// event callback functions. Issues for reconsideration:
+//  - Is this the right number of threads?
+//  - Should EventMgrs be shared between GPUDevices on a multi-GPU machine?
+static const int kNumThreads = 2;
+}  // namespace
+
+namespace gpu_event_mgr {
+class ThreadLabel {
+ public:
+  static const char* GetValue() { return value_; }
+
+  // v must be a static const because value_ will capture and use its value
+  // until reset or thread terminates.
+  static void SetValue(const char* v) { value_ = v; }
+
+ private:
+  static thread_local const char* value_;
+};
+thread_local const char* ThreadLabel::value_ = "";
+
+void WarnIfInCallback(std::function<void()> f) {
+  const char* label = ThreadLabel::GetValue();
+  if (label && !strcmp(label, "gpu_event_mgr")) {
+    if (f) {
+      f();
+    } else {
+      LOG(WARNING) << "Executing inside EventMgr callback thread: "
+                   << CurrentStackTrace();
+    }
+  }
+}
+
+void InitThreadpoolLabels(thread::ThreadPool* threadpool) {
+  static const char* label = "gpu_event_mgr";
+  mutex mu;
+  int init_count = 0;
+  condition_variable all_initialized;
+  int exit_count = 0;
+  condition_variable ready_to_exit;
+  const int num_threads = threadpool->NumThreads();
+  for (int i = 0; i < num_threads; ++i) {
+    threadpool->Schedule([num_threads, &mu, &init_count, &all_initialized,
+                          &exit_count, &ready_to_exit]() {
+      gpu_event_mgr::ThreadLabel::SetValue(label);
+      mutex_lock l(mu);
+      ++init_count;
+      if (init_count == num_threads) {
+        all_initialized.notify_all();
+      }
+      while (init_count < num_threads) {
+        all_initialized.wait(l);
+      }
+      if (++exit_count == num_threads) {
+        ready_to_exit.notify_all();
+      }
+    });
+  }
+  {
+    mutex_lock l(mu);
+    while (exit_count < num_threads) {
+      ready_to_exit.wait(l);
+    }
+  }
+}
+}  // namespace gpu_event_mgr
+
 EventMgr::EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options)
     : exec_(se),
       deferred_bytes_threshold_(gpu_options.deferred_deletion_bytes()
@@ -31,9 +100,8 @@ EventMgr::EventMgr(se::StreamExecutor* se, const GPUOptions& gpu_options)
       accumulated_stream_(nullptr),
       accumulated_tensors_(new TensorReferenceVector),
       accumulated_tensor_bytes_(0),
-      // threadpool_ has 1 thread for the polling loop, and one to execute
-      // event callback functions. Maybe we should have more?
-      threadpool_(Env::Default(), "GPU_Event_Manager", 2) {
+      threadpool_(Env::Default(), "GPU_Event_Manager", kNumThreads) {
+  gpu_event_mgr::InitThreadpoolLabels(&threadpool_);
   StartPollingLoop();
 }
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
index b26f88a201c15720aa1ea5e3bbd135296d934f12..f0a109cc10847ebd6dbfd41dbf93a6f90341a61a 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr.h
@@ -39,6 +39,25 @@ namespace tensorflow {
 
 class GPUOptions;
 
+// The callback provided to EventMgr::ThenExecute must not block or take a long
+// time.  If it does, performance may be impacted and GPU memory may be
+// exhausted.  This macro is for checking that an EventMgr thread is not
+// accidentally entering blocking parts of the code, e.g. the RPC subsystem.
+//
+// Intended use is something like
+//
+//   void RespondToAnRPC(Params* params) {
+//      WARN_IF_IN_EVENT_MGR_THREAD;
+//      if (params->status.ok()) { ...
+//
+namespace gpu_event_mgr {
+// Logs a stack trace if current execution thread belongs to this EventMgr
+// object.  If f is not nullptr, executes instead of  logging the stack trace.
+// trace.
+void WarnIfInCallback(std::function<void()> f);
+}  // namespace gpu_event_mgr
+#define WARN_IF_IN_EVENT_MGR_THREAD gpu_event_mgr::WarnIfInCallback(nullptr)
+
 // An object to keep track of pending Events in the StreamExecutor streams
 // and associated Tensors that cannot safely be deleted until the associated
 // Events are recorded.
@@ -74,6 +93,9 @@ class EventMgr {
     FreeMemory(to_free);
   }
 
+  // Execute func when all pending stream actions have completed.
+  // func must be brief and non-blocking since it executes in the one
+  // thread used for all such callbacks and also buffer deletions.
   inline void ThenExecute(se::Stream* stream, std::function<void()> func) {
     ToFreeVector to_free;
     {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
index c5ff6c97a176a871e4fd47555f6ea8e3513ab8c2..d2adf699f524ef6771da6b0a41e7fc552d2bbdfa 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include <atomic>
 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
+#include "tensorflow/core/lib/core/notification.h"
 #include "tensorflow/core/platform/stream_executor.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/protobuf/config.pb.h"
@@ -243,6 +244,28 @@ TEST(EventMgr, NonEmptyShutdown) {
   }
 }
 
+// Tests that WarnIfInCallback() triggers correctly.
+TEST(EventMgr, WarnIfInCallback) {
+  auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
+  EventMgr em(stream_exec, GPUOptions());
+  TEST_EventMgrHelper th(&em);
+  std::unique_ptr<se::Stream> stream(new se::Stream(stream_exec));
+  CHECK(stream);
+  stream->Init();
+  bool hit = false;
+  gpu_event_mgr::WarnIfInCallback([&hit] { hit = true; });
+  EXPECT_FALSE(hit);
+  Notification note;
+  em.ThenExecute(stream.get(), [&hit, &note]() {
+    gpu_event_mgr::WarnIfInCallback([&hit, &note] {
+      hit = true;
+      note.Notify();
+    });
+  });
+  note.WaitForNotification();
+  EXPECT_TRUE(hit);
+}
+
 }  // namespace
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 29f702699f4370d2a36bc807410078b2e09abd5a..94e10dbfa267e63d1f04cf805368621efc99addc 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -22,7 +22,6 @@ limitations under the License.
 #ifdef INTEL_MKL
 
 #include <cstdlib>
-#include <string>
 #include "tensorflow/core/common_runtime/bfc_allocator.h"
 #include "tensorflow/core/common_runtime/visitable_allocator.h"
 #include "tensorflow/core/lib/strings/numbers.h"
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index 6781c87f6c27d76c4a964dbe200baebe8742be32..613470365d9691a83d178d812eea5bd734fe0fca 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -41,10 +41,8 @@ namespace {
 const StringPiece kColocationAttrNameStringPiece(kColocationAttrName);
 const StringPiece kColocationGroupPrefixStringPiece(kColocationGroupPrefix);
 
-// Returns a list of devices sorted by preferred type and then name
-// from 'devices' whose type is in 'supported_device_types'.  This
-// function searches the device types in 'supported_device_types' and
-// returns the subset of devices that match.
+// Returns a list of devices having type in supported_device_types.  The
+// returned list is sorted by preferred type (higher numeric type is preferred).
 std::vector<Device*> FilterSupportedDevices(
     const std::vector<Device*>& devices,
     const DeviceTypeVector& supported_device_types) {
@@ -81,12 +79,12 @@ std::vector<Device*> FilterSupportedDevices(
 //   DeviceSet device_set = ...;
 //   ColocationGraph colocation_graph(graph, device_set);
 //
-//   // Add all the nodes of graph to colocation_graph.
+//   // Add all the nodes of the `graph` to the `colocation_graph`.
 //   for (Node* node : graph.nodes()) {
 //     TF_RETURN_IF_ERROR(colocation_graph.AddNode(*node));
 //   }
 //
-//   // Add one or more colocation constraint.
+//   // Add one or more colocation constraints.
 //   Node node_1 = *graph.FindNodeId(...);
 //   Node node_2 = *graph.FindNodeId(...);
 //   TF_RETURN_IF_ERROR(colocation_graph.ColocateNodes(node_1, node_2));
@@ -96,9 +94,9 @@ std::vector<Device*> FilterSupportedDevices(
 //     TF_RETURN_IF_ERROR(colocation_graph.AssignDevice(node));
 //   }
 //
-// The implementation uses the union-find algorithm to maintain the
-// connected components efficiently and incrementally as edges
-// (implied by ColocationGraph::ColocateNodes() invocations) are added.
+// This implementation uses the Union-Find algorithm to efficiently maintain the
+// connected components and incrementally adds edges via
+// ColocationGraph::ColocateNodes() invocations.
 class ColocationGraph {
  public:
   ColocationGraph(Graph* graph, const DeviceSet* device_set,
@@ -134,13 +132,9 @@ class ColocationGraph {
     std::unordered_map<StringPiece, const Node*, StringPieceHasher>
         colocation_group_root;
 
-    for (Node* node : graph_->nodes()) {
-      if (!node->IsOp()) {
-        continue;
-      }
-
-      // When adding the node, identify whether it is part of a
-      // colocation group.
+    for (Node* node : graph_->op_nodes()) {
+      // When adding the node, identify whether it is part of a colocation
+      // group.
 
       // This code is effectively the equivalent of GetNodeAttr() for a string
       // array, but it avoids all internal allocations (the allocation of the
@@ -219,11 +213,10 @@ class ColocationGraph {
     Member& x_root_member = members_[x_root];
     Member& y_root_member = members_[y_root];
 
-    // Merge the sets by swinging the parent pointer of the smaller
-    // tree to point to the root of the larger tree. Together with
-    // path compression in ColocationGraph::FindRoot, this ensures
-    // that we do not experience pathological performance on graphs
-    // such as chains.
+    // Merge the sets by setting the parent pointer of the smaller tree's root
+    // node to point to the root of the larger tree. Together with path
+    // compression in ColocationGraph::FindRoot, this ensures that we do not
+    // experience pathological performance on graphs such as chains.
     int new_root, old_root;
     if (x_root_member.rank < y_root_member.rank) {
       // The tree rooted at x_root is shallower, so connect it to
@@ -611,22 +604,16 @@ class ColocationGraph {
   // given id is connected.
   int FindRoot(int node_id) {
     Member& member = members_[node_id];
-
-    int parent = member.parent;
-    DCHECK_GE(parent, 0);
-
-    if (parent != node_id) {
-      // NOTE: Compress paths from node_id to its root, so that future
-      // calls to FindRoot and ColocateNodes are more efficient.
-      int root = FindRoot(parent);
-      if (parent != root) {
-        parent = root;
-        member.parent = root;
-      }
+    DCHECK_GE(member.parent, 0);
+    if (member.parent == node_id) {
+      // member.parent is the root of this disjoint tree.  Do nothing.
+    } else {
+      member.parent = FindRoot(member.parent);
     }
-
-    DCHECK_GE(parent, 0);
-    return parent;
+    // Now it is guaranteed that member.parent is the root of this disjoint
+    // tree.
+    DCHECK_GE(member.parent, 0);
+    return member.parent;
   }
 
   // Ensures that the devices of 'dst's resource and reference match the device
diff --git a/tensorflow/core/common_runtime/session_ref.cc b/tensorflow/core/common_runtime/session_ref.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b931ef422978de117f9de68e9e26e5e928bf7ae3
--- /dev/null
+++ b/tensorflow/core/common_runtime/session_ref.cc
@@ -0,0 +1,170 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/common_runtime/session_ref.h"
+
+#include <utility>
+
+namespace tensorflow {
+
+namespace {
+
+// Scope helper to track active calls and manage session lifetime.
+struct RunCounter {
+  std::shared_ptr<Session> session;
+  uint64* value;
+  mutex* m;
+  condition_variable* cv;
+
+  explicit RunCounter(std::shared_ptr<Session> s, uint64* v, mutex* m,
+                      condition_variable* cv)
+      : session(std::move(s)), value(v), m(m), cv(cv) {
+    mutex_lock l(*m);
+    ++*value;
+  }
+
+  ~RunCounter() {
+    mutex_lock l(*m);
+    if (--*value == 0) {
+      cv->notify_all();
+    }
+  }
+};
+
+}  // namespace
+
+Status SessionRef::CheckNotClosed() {
+  mutex_lock l(run_lock_);
+  if (session_ == nullptr) return errors::Cancelled("Session has been closed.");
+  return ::tensorflow::Status::OK();
+}
+
+Status SessionRef::Run(const RunOptions& run_options,
+                       const std::vector<std::pair<string, Tensor> >& inputs,
+                       const std::vector<string>& output_tensor_names,
+                       const std::vector<string>& target_node_names,
+                       std::vector<Tensor>* outputs,
+                       RunMetadata* run_metadata) {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+  return rc.session->Run(run_options, inputs, output_tensor_names,
+                         target_node_names, outputs, run_metadata);
+}
+
+Status SessionRef::Create(const GraphDef& graph) {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+  return rc.session->Create(graph);
+}
+
+Status SessionRef::Create(const RunOptions& run_options,
+                          const GraphDef& graph) {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+  return rc.session->Create(run_options, graph);
+}
+
+Status SessionRef::Extend(const RunOptions& run_options,
+                          const GraphDef& graph) {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+  return rc.session->Extend(run_options, graph);
+}
+
+Status SessionRef::Extend(const GraphDef& graph) {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+  return rc.session->Extend(graph);
+}
+
+Status SessionRef::Close(const RunOptions& run_options) {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  mutex_lock l(run_lock_);
+  Status status = session_->Close(run_options);
+  session_.reset();
+  while (run_count_ > 0) {
+    run_finished_.wait(l);
+  }
+  return status;
+}
+
+Status SessionRef::Close() {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  mutex_lock l(run_lock_);
+  Status status = session_->Close();
+  session_.reset();
+  while (run_count_ > 0) {
+    run_finished_.wait(l);
+  }
+  return status;
+}
+
+Status SessionRef::Run(const std::vector<std::pair<string, Tensor> >& inputs,
+                       const std::vector<string>& output_tensor_names,
+                       const std::vector<string>& target_node_names,
+                       std::vector<Tensor>* outputs) {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+  return rc.session->Run(inputs, output_tensor_names, target_node_names,
+                         outputs);
+}
+
+Status SessionRef::ListDevices(std::vector<DeviceAttributes>* response) {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+  return rc.session->ListDevices(response);
+}
+
+Status SessionRef::PRunSetup(const std::vector<string>& input_names,
+                             const std::vector<string>& output_names,
+                             const std::vector<string>& target_nodes,
+                             string* handle) {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+  return rc.session->PRunSetup(input_names, output_names, target_nodes, handle);
+}
+
+Status SessionRef::PRun(const string& handle,
+                        const std::vector<std::pair<string, Tensor> >& inputs,
+                        const std::vector<string>& output_names,
+                        std::vector<Tensor>* outputs) {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+  return rc.session->PRun(handle, inputs, output_names, outputs);
+}
+
+Status SessionRef::MakeCallable(const CallableOptions& callable_options,
+                                CallableHandle* out_handle) {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+  return rc.session->MakeCallable(callable_options, out_handle);
+}
+
+Status SessionRef::RunCallable(CallableHandle handle,
+                               const std::vector<Tensor>& feed_tensors,
+                               std::vector<Tensor>* fetch_tensors,
+                               RunMetadata* run_metadata) {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+  return rc.session->RunCallable(handle, feed_tensors, fetch_tensors,
+                                 run_metadata);
+}
+
+Status SessionRef::ReleaseCallable(CallableHandle handle) {
+  TF_RETURN_IF_ERROR(CheckNotClosed());
+  RunCounter rc(session_, &run_count_, &run_lock_, &run_finished_);
+  return rc.session->ReleaseCallable(handle);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/session_ref.h b/tensorflow/core/common_runtime/session_ref.h
new file mode 100644
index 0000000000000000000000000000000000000000..6146933326fafcf32ac234ae26b91cbcf01d9d08
--- /dev/null
+++ b/tensorflow/core/common_runtime/session_ref.h
@@ -0,0 +1,86 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_SESSION_REF_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_SESSION_REF_H_
+
+#include <memory>
+
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+
+// A `SessionRef` manages the lifetime of a wrapped `Session` pointer.
+//
+// SessionRef blocks the return of Close() until all pending operations have
+// been completed or cancelled and underlying session has been freed.  Any
+// subsequent operations on the SessionRef object will return errors::Cancelled.
+class SessionRef : public Session {
+ public:
+  SessionRef(Session* session) : session_(session) {}
+  virtual ~SessionRef() {}
+
+  Status Create(const GraphDef& graph) override;
+  Status Extend(const GraphDef& graph) override;
+  Status Create(const RunOptions& run_options, const GraphDef& graph) override;
+  Status Extend(const RunOptions& run_options, const GraphDef& graph) override;
+  Status Run(const std::vector<std::pair<string, Tensor> >& inputs,
+             const std::vector<string>& output_tensor_names,
+             const std::vector<string>& target_node_names,
+             std::vector<Tensor>* outputs) override;
+
+  Status ListDevices(std::vector<DeviceAttributes>* response) override;
+
+  Status Close() override;
+  Status Close(const RunOptions& run_options) override;
+
+  Status Run(const RunOptions& run_options,
+             const std::vector<std::pair<string, Tensor> >& inputs,
+             const std::vector<string>& output_tensor_names,
+             const std::vector<string>& target_node_names,
+             std::vector<Tensor>* outputs, RunMetadata* run_metadata) override;
+
+  Status PRunSetup(const std::vector<string>& input_names,
+                   const std::vector<string>& output_names,
+                   const std::vector<string>& target_nodes,
+                   string* handle) override;
+
+  Status PRun(const string& handle,
+              const std::vector<std::pair<string, Tensor> >& inputs,
+              const std::vector<string>& output_names,
+              std::vector<Tensor>* outputs) override;
+
+  Status MakeCallable(const CallableOptions& callable_options,
+                      CallableHandle* out_handle);
+
+  Status RunCallable(CallableHandle handle,
+                     const std::vector<Tensor>& feed_tensors,
+                     std::vector<Tensor>* fetch_tensors,
+                     RunMetadata* run_metadata);
+
+  Status ReleaseCallable(CallableHandle handle);
+
+ private:
+  mutex run_lock_;
+  condition_variable run_finished_;
+  uint64 run_count_ GUARDED_BY(run_lock_) = {0};
+  std::shared_ptr<Session> session_;
+
+  Status CheckNotClosed();
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_SESSION_REF_H_
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index 74a87215e1917db4c149da320a4d53c6ea6a25be..7406ecf4f82119d3e9898af53823c1fccec83765 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -111,7 +111,21 @@ Status ThreadPoolDevice::MakeTensorFromProto(
 }
 
 #ifdef INTEL_MKL
-REGISTER_MEM_ALLOCATOR("MklCPUAllocator", 200, MklCPUAllocator);
+namespace {
+class MklCPUAllocatorFactory : public AllocatorFactory {
+ public:
+  bool NumaEnabled() override { return false; }
+
+  Allocator* CreateAllocator() override { return new MklCPUAllocator; }
+
+  // Note: Ignores numa_node, for now.
+  virtual SubAllocator* CreateSubAllocator(int numa_node) {
+    return new MklSubAllocator;
+  }
+};
+
+REGISTER_MEM_ALLOCATOR("MklCPUAllocator", 200, MklCPUAllocatorFactory);
+}  // namespace
 #endif
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/eager/eager_client.h b/tensorflow/core/distributed_runtime/eager/eager_client.h
index 9ba8c8d80cb0db9f2d49b4162f82160bb345b009..707f3234b97a6d7a1348531b6bfc985530308c5e 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_client.h
+++ b/tensorflow/core/distributed_runtime/eager/eager_client.h
@@ -39,6 +39,7 @@ class EagerClient {
   CLIENT_METHOD(KeepAlive);
   CLIENT_METHOD(CloseContext);
   CLIENT_METHOD(RegisterFunction);
+  CLIENT_METHOD(SendTensor);
 
 #undef CLIENT_METHOD
 };
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
index 466e779fab7c2c984a9bae84a82a48d4a3e01e90..916c8720f0f5272def176223d666ad49abd50a35 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
@@ -81,10 +81,11 @@ Status GetNumRetvals(tensorflow::EagerContext* context, const string& op_name,
 
 Status EagerServiceImpl::CreateContext(const CreateContextRequest* request,
                                        CreateContextResponse* response) {
-  //make sure env_ , env_->rendezvous_mgr available
+  // make sure env_ , env_->rendezvous_mgr available
   if (env_ == nullptr || env_->rendezvous_mgr == nullptr) {
-    return tensorflow::errors::Internal("invalid eager env_ or env_->rendezvous_mgr.");
-  } 
+    return tensorflow::errors::Internal(
+        "invalid eager env_ or env_->rendezvous_mgr.");
+  }
   std::vector<tensorflow::Device*> devices;
 
   TF_RETURN_IF_ERROR(tensorflow::DeviceFactory::AddDevices(
@@ -266,6 +267,35 @@ Status EagerServiceImpl::RegisterFunction(
   return context->Context()->AddFunctionDef(request->function_def());
 }
 
+Status EagerServiceImpl::SendTensor(const SendTensorRequest* request,
+                                    SendTensorResponse* response) {
+  ServerContext* context = nullptr;
+  TF_RETURN_IF_ERROR(GetServerContext(request->context_id(), &context));
+  core::ScopedUnref context_unref(context);
+
+  tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 2> tensors;
+  for (const auto& tensor_proto : request->tensors()) {
+    Tensor tensor;
+    if (!tensor.FromProto(tensor_proto)) {
+      return errors::InvalidArgument("Unable to parse tensor proto");
+    }
+
+    TensorHandle* tensor_handle =
+        new TensorHandle(tensor, nullptr, nullptr, nullptr);
+
+    TensorHandle* copied_handle = nullptr;
+    TF_RETURN_IF_ERROR(EagerCopyToDevice(tensor_handle, context->Context(),
+                                         request->device_name().c_str(),
+                                         &copied_handle));
+    tensors.push_back(copied_handle);
+    tensor_handle->Unref();
+  }
+
+  context->AddOperationOutputs(tensors, request->op_id());
+
+  return Status::OK();
+}
+
 tensorflow::Status EagerServiceImpl::GetServerContext(
     uint64 context_id, ServerContext** server_context) {
   mutex_lock l(contexts_mu_);
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
index b0e4aa84b9e9b84f54644c673c3bcbe4bcd55395..718b4e2457b63caff47996274fc00b99a3c551c5 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h
@@ -62,6 +62,9 @@ class EagerServiceImpl {
   Status RegisterFunction(const RegisterFunctionRequest* request,
                           RegisterFunctionResponse* response);
 
+  Status SendTensor(const SendTensorRequest* request,
+                    SendTensorResponse* response);
+
  protected:
   // This is the server-side execution context. All state regarding execution of
   // a client's ops is held in this server-side context (all generated tensors,
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
index b98386ba86d8828cf2596c60afa1c7bca4fd497e..d1f2a6da8fe47eb32fc4c3c23ff36018e6136cdd 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
@@ -84,7 +84,7 @@ class EagerServiceImplTest : public ::testing::Test {
   std::unique_ptr<DeviceMgr> device_mgr_;
 };
 
-void SetTensorProto(AttrValue* val) {
+void SetTensorProto(TensorProto* tensor_proto) {
   int64_t dims[] = {2, 2};
   float data[] = {1.0f, 2.0f, 3.0f, 4.0f};
   TF_Tensor* t = TF_AllocateTensor(
@@ -92,7 +92,7 @@ void SetTensorProto(AttrValue* val) {
   memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t));
   tensorflow::Tensor tensor;
   TF_ASSERT_OK(tensorflow::TF_TensorToTensor(t, &tensor));
-  tensor.AsProtoTensorContent(val->mutable_tensor());
+  tensor.AsProtoTensorContent(tensor_proto);
   TF_DeleteTensor(t);
 }
 
@@ -175,7 +175,7 @@ TEST_F(EagerServiceImplTest, BasicTest) {
   val.set_type(tensorflow::DataType::DT_FLOAT);
   const_attrs.insert({"dtype", val});
   val.Clear();
-  SetTensorProto(&val);
+  SetTensorProto(val.mutable_tensor());
   const_attrs.insert({"value", val});
 
   AddOperationToEnqueueRequest(1, "Const", {}, const_attrs,
@@ -260,7 +260,7 @@ TEST_F(EagerServiceImplTest, BasicFunctionTest) {
   const_attrs.insert({"dtype", val});
   val.Clear();
 
-  SetTensorProto(&val);
+  SetTensorProto(val.mutable_tensor());
   const_attrs.insert({"value", val});
 
   AddOperationToEnqueueRequest(1, "Const", {}, const_attrs,
@@ -294,6 +294,77 @@ TEST_F(EagerServiceImplTest, BasicFunctionTest) {
                                                &close_context_response));
 }
 
+// Test creates a context and attempts to send a tensor (using the RPC), and
+// then use the tensor.
+TEST_F(EagerServiceImplTest, SendTensorTest) {
+  TestEagerServiceImpl eager_service_impl(&worker_env_);
+
+  CreateContextRequest request;
+  request.mutable_server_def()->set_job_name("localhost");
+  request.mutable_server_def()->set_task_index(0);
+  request.set_rendezvous_id(random::New64());
+  CreateContextResponse response;
+
+  TF_ASSERT_OK(eager_service_impl.CreateContext(&request, &response));
+
+  uint64 context_id = response.context_id();
+
+  SendTensorRequest send_tensor_request;
+  send_tensor_request.set_context_id(context_id);
+  send_tensor_request.set_op_id(1);
+  SetTensorProto(send_tensor_request.add_tensors());
+  SendTensorResponse send_tensor_response;
+
+  TF_ASSERT_OK(eager_service_impl.SendTensor(&send_tensor_request,
+                                             &send_tensor_response));
+
+  EnqueueRequest remote_enqueue_request;
+  remote_enqueue_request.set_context_id(context_id);
+  EnqueueResponse remote_enqueue_response;
+
+  std::unordered_map<string, AttrValue> attrs;
+  AttrValue val;
+  val.Clear();
+  val.set_type(tensorflow::DataType::DT_FLOAT);
+  attrs.insert({"T", val});
+  val.Clear();
+  val.set_b(false);
+  attrs.insert({"transpose_a", val});
+  attrs.insert({"transpose_b", val});
+
+  AddOperationToEnqueueRequest(2, "MatMul", {{1, 0}, {1, 0}}, attrs,
+                               "/job:localhost/replica:0/task:0/device:CPU:0",
+                               &remote_enqueue_request);
+
+  TF_ASSERT_OK(eager_service_impl.Enqueue(&remote_enqueue_request,
+                                          &remote_enqueue_response));
+
+  const tensorflow::Tensor* t = nullptr;
+  tensorflow::TensorHandle* tensor_handle;
+  TF_ASSERT_OK(eager_service_impl.GetTensorHandle(
+      response.context_id(), RemoteTensorHandleInternal(2, 0), &tensor_handle));
+  TF_ASSERT_OK(tensor_handle->Tensor(&t));
+
+  Device* device = nullptr;
+  TF_ASSERT_OK(tensor_handle->Device(&device));
+  EXPECT_NE(device, nullptr);
+  EXPECT_EQ(device->name(), "/job:localhost/replica:0/task:0/device:CPU:0");
+
+  auto actual = t->flat<float>();
+  EXPECT_EQ(4, actual.size());
+
+  EXPECT_EQ(7, actual(0));
+  EXPECT_EQ(10, actual(1));
+  EXPECT_EQ(15, actual(2));
+  EXPECT_EQ(22, actual(3));
+
+  CloseContextRequest close_context_request;
+  close_context_request.set_context_id(context_id);
+  CloseContextResponse close_context_response;
+  TF_ASSERT_OK(eager_service_impl.CloseContext(&close_context_request,
+                                               &close_context_response));
+}
+
 }  // namespace
 }  // namespace eager
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/eager/remote_execute_node.h b/tensorflow/core/distributed_runtime/eager/remote_execute_node.h
index 28b68c3b884d403b433359bf46616e85c92d1ccf..0e3a68c4d80c85addae71f579bff753c55c31f5e 100644
--- a/tensorflow/core/distributed_runtime/eager/remote_execute_node.h
+++ b/tensorflow/core/distributed_runtime/eager/remote_execute_node.h
@@ -29,8 +29,8 @@ namespace eager {
 class RemoteExecuteNode : public tensorflow::EagerNode {
  public:
   RemoteExecuteNode(
-      tensorflow::uint64 id, const tensorflow::eager::EnqueueRequest& request,
-      tensorflow::eager::EagerClient* eager_client,
+      tensorflow::uint64 id, std::unique_ptr<EnqueueRequest> request,
+      EagerClient* eager_client,
       const gtl::InlinedVector<TensorHandle*, 4>& inputs,
       std::function<void(const Status& status, const EnqueueResponse& response)>
           done_callback)
@@ -45,8 +45,8 @@ class RemoteExecuteNode : public tensorflow::EagerNode {
   }
 
   RemoteExecuteNode(tensorflow::uint64 id,
-                    const tensorflow::eager::EnqueueRequest& request,
-                    tensorflow::eager::EagerClient* eager_client)
+                    std::unique_ptr<EnqueueRequest> request,
+                    EagerClient* eager_client)
       : tensorflow::EagerNode(id),
         request_(std::move(request)),
         eager_client_(eager_client) {}
@@ -58,10 +58,10 @@ class RemoteExecuteNode : public tensorflow::EagerNode {
   }
 
   tensorflow::Status Run() override {
-    tensorflow::eager::EnqueueResponse response;
-    tensorflow::Status status;
+    EnqueueResponse response;
+    Status status;
     Notification n;
-    eager_client_->EnqueueAsync(&request_, &response,
+    eager_client_->EnqueueAsync(request_.get(), &response,
                                 [&n, &status](const tensorflow::Status& s) {
                                   status.Update(s);
                                   n.Notify();
@@ -76,9 +76,8 @@ class RemoteExecuteNode : public tensorflow::EagerNode {
   }
 
  private:
-  EnqueueRequest request_;
-  tensorflow::eager::EagerClient*
-      eager_client_;  // Not owned, and must outlive the RemoteExecuteNode.
+  std::unique_ptr<EnqueueRequest> request_;
+  EagerClient* eager_client_;  // Not owned, and must outlive this node.
 
   // This is required to ensure that the tensor handles stay alive across the
   // execution.
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
index b23466037f46200109a0c42be66ed0a8e32ef1e7..181422118cd9f01658c1601a1779355f127c6fac 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
@@ -49,6 +49,7 @@ class GrpcEagerClient : public EagerClient {
   CLIENT_METHOD(KeepAlive);
   CLIENT_METHOD(CloseContext);
   CLIENT_METHOD(RegisterFunction);
+  CLIENT_METHOD(SendTensor);
 
 #undef CLIENT_METHOD
 
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.cc
index 39ab6856c53e86dbe4263e0054111fe3a60ebfaf..ab3aa3fd1de291a1318ccf1f675bcb4e87a66fb1 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.cc
@@ -36,6 +36,7 @@ static const char* grpcEagerService_method_names[] = {
     "/tensorflow.eager.EagerService/KeepAlive",
     "/tensorflow.eager.EagerService/CloseContext",
     "/tensorflow.eager.EagerService/RegisterFunction",
+    "/tensorflow.eager.EagerService/SendTensor",
 };
 
 std::unique_ptr<EagerService::Stub> EagerService::NewStub(
@@ -62,7 +63,9 @@ EagerService::Stub::Stub(
                               ::grpc::internal::RpcMethod::NORMAL_RPC, channel),
       rpcmethod_RegisterFunction_(grpcEagerService_method_names[5],
                                   ::grpc::internal::RpcMethod::NORMAL_RPC,
-                                  channel) {}
+                                  channel),
+      rpcmethod_SendTensor_(grpcEagerService_method_names[6],
+                            ::grpc::internal::RpcMethod::NORMAL_RPC, channel) {}
 
 ::grpc::Status EagerService::Stub::CreateContext(
     ::grpc::ClientContext* context, const CreateContextRequest& request,
@@ -106,8 +109,15 @@ EagerService::Stub::Stub(
       channel_.get(), rpcmethod_RegisterFunction_, context, request, response);
 }
 
+::grpc::Status EagerService::Stub::SendTensor(::grpc::ClientContext* context,
+                                              const SendTensorRequest& request,
+                                              SendTensorResponse* response) {
+  return ::grpc::internal::BlockingUnaryCall(
+      channel_.get(), rpcmethod_SendTensor_, context, request, response);
+}
+
 EagerService::AsyncService::AsyncService() {
-  for (int i = 0; i < 6; ++i) {
+  for (int i = 0; i < 7; ++i) {
     AddMethod(new ::grpc::internal::RpcServiceMethod(
         grpcEagerService_method_names[i],
         ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr));
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h
index 66458186ada4bba1433b1f95b0a3ac7ea699117a..521e0ac4fabc5a28e210ed68bfde0bda81fce737 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h
@@ -69,6 +69,9 @@ class EagerService final {
     virtual ::grpc::Status RegisterFunction(
         ::grpc::ClientContext* context, const RegisterFunctionRequest& request,
         RegisterFunctionResponse* response) = 0;
+    virtual ::grpc::Status SendTensor(::grpc::ClientContext* context,
+                                      const SendTensorRequest& request,
+                                      SendTensorResponse* response) = 0;
   };
   class Stub final : public StubInterface {
    public:
@@ -91,6 +94,9 @@ class EagerService final {
     ::grpc::Status RegisterFunction(
         ::grpc::ClientContext* context, const RegisterFunctionRequest& request,
         RegisterFunctionResponse* response) override;
+    ::grpc::Status SendTensor(::grpc::ClientContext* context,
+                              const SendTensorRequest& request,
+                              SendTensorResponse* response) override;
 
    private:
     std::shared_ptr< ::grpc::ChannelInterface> channel_;
@@ -100,6 +106,7 @@ class EagerService final {
     const ::grpc::internal::RpcMethod rpcmethod_KeepAlive_;
     const ::grpc::internal::RpcMethod rpcmethod_CloseContext_;
     const ::grpc::internal::RpcMethod rpcmethod_RegisterFunction_;
+    const ::grpc::internal::RpcMethod rpcmethod_SendTensor_;
   };
   static std::unique_ptr<Stub> NewStub(
       const std::shared_ptr< ::grpc::ChannelInterface>& channel,
@@ -157,6 +164,14 @@ class EagerService final {
       ::grpc::Service::RequestAsyncUnary(5, context, request, response,
                                          new_call_cq, notification_cq, tag);
     }
+    void RequestSendTensor(
+        ::grpc::ServerContext* context, SendTensorRequest* request,
+        ::grpc::ServerAsyncResponseWriter<SendTensorResponse>* response,
+        ::grpc::CompletionQueue* new_call_cq,
+        ::grpc::ServerCompletionQueue* notification_cq, void* tag) {
+      ::grpc::Service::RequestAsyncUnary(6, context, request, response,
+                                         new_call_cq, notification_cq, tag);
+    }
   };
 };
 
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc
index 52e06c263d8e78b03ab1b2c07a37c22a7fc5686e..f511674e1f0584fd296d0c5ae050cf504f38366e 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.cc
@@ -27,9 +27,7 @@ namespace eager {
 
 GrpcEagerServiceImpl::GrpcEagerServiceImpl(
     const WorkerEnv* env, ::grpc::ServerBuilder* server_builder)
-    : local_impl_(env) {
-  request_handler_threadpool_ =
-      MakeUnique<thread::ThreadPool>(env->env, "EagerServiceRequestHandler", 4);
+    : env_(env), local_impl_(env) {
   server_builder->RegisterService(&service_);
   cq_ = server_builder->AddCompletionQueue();
 }
@@ -50,6 +48,7 @@ void GrpcEagerServiceImpl::HandleRPCsLoop() {
   ENQUEUE_REQUEST(KeepAlive);
   ENQUEUE_REQUEST(CloseContext);
   ENQUEUE_REQUEST(RegisterFunction);
+  ENQUEUE_REQUEST(SendTensor);
 #undef ENQUEUE_REQUEST
 
   void* tag;  // Matches the operation started against this cq_.
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
index 9a9402634255abe515fc48c628ac887a1ba08856..537e9043bdcd9cdda42548fd71d79ddd22dcc8dc 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
@@ -45,7 +45,7 @@ class GrpcEagerServiceImpl : public AsyncServiceInterface {
  private:
 #define HANDLER(method)                                                        \
   void method##Handler(EagerCall<method##Request, method##Response>* call) {   \
-    request_handler_threadpool_->Schedule([this, call]() {                     \
+    env_->compute_pool->Schedule([this, call]() {                              \
       call->SendResponse(                                                      \
           ToGrpcStatus(local_impl_.method(&call->request, &call->response)));  \
     });                                                                        \
@@ -62,8 +62,10 @@ class GrpcEagerServiceImpl : public AsyncServiceInterface {
   HANDLER(KeepAlive);
   HANDLER(CloseContext);
   HANDLER(RegisterFunction);
+  HANDLER(SendTensor);
 #undef HANDLER
 
+  const WorkerEnv* const env_;  // Not owned.
   EagerServiceImpl local_impl_;
 
   std::unique_ptr<::grpc::Alarm> shutdown_alarm_;
@@ -71,8 +73,6 @@ class GrpcEagerServiceImpl : public AsyncServiceInterface {
   std::unique_ptr<::grpc::ServerCompletionQueue> cq_;
   tensorflow::eager::grpc::EagerService::AsyncService service_;
 
-  std::unique_ptr<thread::ThreadPool> request_handler_threadpool_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(GrpcEagerServiceImpl);
 };
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index db14f6473e459483ad9054d56865930d89363ed3..8a6903be9eb616f322e0034b21e950f75606b2b7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -244,6 +244,7 @@ Status GrpcServer::Init(
   // Finish setting up master environment.
   master_env_.ops = OpRegistry::Global();
   master_env_.worker_cache = worker_cache;
+  master_env_.collective_executor_mgr = worker_env_.collective_executor_mgr;
   master_env_.master_session_factory =
       [config, stats_factory](
           SessionOptions options, const MasterEnv* env,
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 1c62d37955b3135eb691460ee57e08eb9524cab5..888ed0c57b04463367f5d1b9699c36e2f405a43e 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -91,6 +91,11 @@ void EnableCPUAllocatorFullStats(bool enable) {
   cpu_allocator_collect_full_stats = enable;
 }
 
+namespace {
+// A default Allocator for CPU devices.  ProcessState::GetCPUAllocator() will
+// return a different version that may perform better, but may also lack the
+// optional stats triggered by the functions above.  TODO(tucker): migrate all
+// uses of cpu_allocator() except tests to use ProcessState instead.
 class CPUAllocator : public Allocator {
  public:
   CPUAllocator()
@@ -170,14 +175,42 @@ class CPUAllocator : public Allocator {
   TF_DISALLOW_COPY_AND_ASSIGN(CPUAllocator);
 };
 
+class CPUAllocatorFactory : public AllocatorFactory {
+ public:
+  Allocator* CreateAllocator() override { return new CPUAllocator; }
+
+  SubAllocator* CreateSubAllocator(int numa_node) override {
+    return new CPUSubAllocator(new CPUAllocator);
+  }
+
+ private:
+  class CPUSubAllocator : public SubAllocator {
+   public:
+    explicit CPUSubAllocator(CPUAllocator* cpu_allocator)
+        : cpu_allocator_(cpu_allocator) {}
+
+    void* Alloc(size_t alignment, size_t num_bytes) override {
+      return cpu_allocator_->AllocateRaw(alignment, num_bytes);
+    }
+
+    void Free(void* ptr, size_t num_bytes) override {
+      cpu_allocator_->DeallocateRaw(ptr);
+    }
+
+   private:
+    CPUAllocator* cpu_allocator_;
+  };
+};
+
+REGISTER_MEM_ALLOCATOR("DefaultCPUAllocator", 100, CPUAllocatorFactory);
+}  // namespace
+
 Allocator* cpu_allocator() {
-  static Allocator* cpu_alloc = AllocatorRegistry::Global()->GetAllocator();
+  static Allocator* cpu_alloc =
+      AllocatorFactoryRegistry::singleton()->GetAllocator();
   if (cpu_allocator_collect_full_stats && !cpu_alloc->TracksAllocationSizes()) {
     cpu_alloc = new TrackingAllocator(cpu_alloc, true);
   }
   return cpu_alloc;
 }
-
-REGISTER_MEM_ALLOCATOR("DefaultCPUAllocator", 100, CPUAllocator);
-
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 2bb4d32d5776640b505e0132f7f1fd263480e722..774b1fe1379fc0e32432c299b04b9894f7d244c3 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -376,16 +376,18 @@ struct AllocatorAttributes {
   int32 scope_id = 0;
 };
 
-// Returns a trivial implementation of Allocator which uses the system
-// default malloc. The returned allocator is a process singleton.
+// Returns a trivial implementation of Allocator, which is a process singleton.
+// Access through this function is only intended for use in tests and auxiliary
+// processing.  Performance sensitive uses should always obtain allocators from
+// ProcessState.
 Allocator* cpu_allocator();
 
-// If 'enable' is true, the process-wide cpu allocator collects
+// If 'enable' is true, the default CPU allocator implementation will collect
 // AllocatorStats. By default, it's disabled.
 void EnableCPUAllocatorStats(bool enable);
 
-// If 'enable' is true, the process-wide cpu allocator collects full
-// statistics. By default, it's disabled.
+// If 'enable' is true, the default CPU allocator implementation will collect
+// full statistics. By default, it's disabled.
 void EnableCPUAllocatorFullStats(bool enable);
 
 // Abstract interface of an object that does the underlying suballoc/free of
diff --git a/tensorflow/core/framework/allocator_registry.cc b/tensorflow/core/framework/allocator_registry.cc
index 486be39ae31c487560efebc79e0fbab90ddca9db..099c4bacc8615d9c7f901a7d725d60e5de1ea676 100644
--- a/tensorflow/core/framework/allocator_registry.cc
+++ b/tensorflow/core/framework/allocator_registry.cc
@@ -21,60 +21,110 @@ limitations under the License.
 namespace tensorflow {
 
 // static
-AllocatorRegistry* AllocatorRegistry::Global() {
-  static AllocatorRegistry* global_allocator_registry = new AllocatorRegistry;
-  return global_allocator_registry;
+AllocatorFactoryRegistry* AllocatorFactoryRegistry::singleton() {
+  static AllocatorFactoryRegistry* singleton = new AllocatorFactoryRegistry;
+  return singleton;
 }
 
-Allocator* AllocatorRegistry::GetRegisteredAllocator(const string& name,
-                                                     int priority) {
-  for (auto entry : allocators_) {
+const AllocatorFactoryRegistry::FactoryEntry*
+AllocatorFactoryRegistry::FindEntry(const string& name, int priority) const {
+  for (auto& entry : factories_) {
     if (!name.compare(entry.name) && priority == entry.priority) {
-      return entry.allocator;
+      return &entry;
     }
   }
   return nullptr;
 }
 
-void AllocatorRegistry::Register(const string& name, int priority,
-                                 Allocator* allocator) {
+void AllocatorFactoryRegistry::Register(const char* source_file,
+                                        int source_line, const string& name,
+                                        int priority,
+                                        AllocatorFactory* factory) {
+  mutex_lock l(mu_);
+  CHECK(!first_alloc_made_) << "Attempt to register an AllocatorFactory "
+                            << "after call to GetAllocator()";
   CHECK(!name.empty()) << "Need a valid name for Allocator";
   CHECK_GE(priority, 0) << "Priority needs to be non-negative";
 
-  Allocator* existing = GetRegisteredAllocator(name, priority);
+  const FactoryEntry* existing = FindEntry(name, priority);
   if (existing != nullptr) {
-    // A duplicate is if the registration name and priority match
-    // but the Allocator::Name()'s don't match.
-    CHECK_EQ(existing->Name(), allocator->Name())
-        << "Allocator with name: [" << name << "], type [" << existing->Name()
-        << "], priority: [" << priority
-        << "] already registered.  Choose a different name to register "
-        << "an allocator of type " << allocator->Name();
-
-    // The allocator names match, so we can just return.
-    // It should be safe to delete the allocator since the caller
-    // gives up ownership of it.
-    delete allocator;
-    return;
+    // Duplicate registration is a hard failure.
+    LOG(FATAL) << "New registration for AllocatorFactory with name=" << name
+               << " priority=" << priority << " at location " << source_file
+               << ":" << source_line
+               << " conflicts with previous registration at location "
+               << existing->source_file << ":" << existing->source_line;
   }
 
-  AllocatorRegistryEntry tmp_entry;
-  tmp_entry.name = name;
-  tmp_entry.priority = priority;
-  tmp_entry.allocator = allocator;
+  FactoryEntry entry;
+  entry.source_file = source_file;
+  entry.source_line = source_line;
+  entry.name = name;
+  entry.priority = priority;
+  entry.factory.reset(factory);
+  factories_.push_back(std::move(entry));
+}
 
-  allocators_.push_back(tmp_entry);
-  int high_pri = -1;
-  for (auto entry : allocators_) {
-    if (high_pri < entry.priority) {
-      m_curr_allocator_ = entry.allocator;
-      high_pri = entry.priority;
+Allocator* AllocatorFactoryRegistry::GetAllocator() {
+  mutex_lock l(mu_);
+  first_alloc_made_ = true;
+  FactoryEntry* best_entry = nullptr;
+  for (auto& entry : factories_) {
+    if (best_entry == nullptr) {
+      best_entry = &entry;
+    } else if (entry.priority > best_entry->priority) {
+      best_entry = &entry;
     }
   }
+  if (best_entry) {
+    if (!best_entry->allocator) {
+      best_entry->allocator.reset(best_entry->factory->CreateAllocator());
+    }
+    return best_entry->allocator.get();
+  } else {
+    LOG(FATAL) << "No registered CPU AllocatorFactory";
+    return nullptr;
+  }
 }
 
-Allocator* AllocatorRegistry::GetAllocator() {
-  return CHECK_NOTNULL(m_curr_allocator_);
+SubAllocator* AllocatorFactoryRegistry::GetSubAllocator(int numa_node) {
+  mutex_lock l(mu_);
+  first_alloc_made_ = true;
+  FactoryEntry* best_entry = nullptr;
+  for (auto& entry : factories_) {
+    if (best_entry == nullptr) {
+      best_entry = &entry;
+    } else if (best_entry->factory->NumaEnabled()) {
+      if (entry.factory->NumaEnabled() &&
+          (entry.priority > best_entry->priority)) {
+        best_entry = &entry;
+      }
+    } else {
+      DCHECK(!best_entry->factory->NumaEnabled());
+      if (entry.factory->NumaEnabled() ||
+          (entry.priority > best_entry->priority)) {
+        best_entry = &entry;
+      }
+    }
+  }
+  if (best_entry) {
+    int index = 0;
+    if (numa_node != port::kNUMANoAffinity) {
+      CHECK_LE(numa_node, port::NUMANumNodes());
+      index = 1 + numa_node;
+    }
+    if (best_entry->sub_allocators.size() < (index + 1)) {
+      best_entry->sub_allocators.resize(index + 1);
+    }
+    if (!best_entry->sub_allocators[index].get()) {
+      best_entry->sub_allocators[index].reset(
+          best_entry->factory->CreateSubAllocator(numa_node));
+    }
+    return best_entry->sub_allocators[index].get();
+  } else {
+    LOG(FATAL) << "No registered CPU AllocatorFactory";
+    return nullptr;
+  }
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/allocator_registry.h b/tensorflow/core/framework/allocator_registry.h
index b26e79ac3b01c7b3fe5099f626c4e35862586282..24f282ce84c61ffcc00499fd11cd1d5fd0ade436 100644
--- a/tensorflow/core/framework/allocator_registry.h
+++ b/tensorflow/core/framework/allocator_registry.h
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Classes to maintain a static registry of memory allocators
+// Classes to maintain a static registry of memory allocator factories.
 #ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_
 #define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_
 
@@ -21,59 +21,100 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/platform/numa.h"
 
 namespace tensorflow {
 
-// A global AllocatorRegistry is used to hold allocators for CPU backends
-class AllocatorRegistry {
+class AllocatorFactory {
  public:
-  // Add an allocator to the registry.  Caller releases ownership of
-  // 'allocator'.
-  void Register(const string& name, int priority, Allocator* allocator);
+  virtual ~AllocatorFactory() {}
 
-  // Return allocator with highest priority
-  // If multiple allocators have the same high priority, return one of them
+  // Returns true if the factory will create a functionally different
+  // SubAllocator for different (legal) values of numa_node.
+  virtual bool NumaEnabled() { return false; }
+
+  // Create an Allocator.
+  virtual Allocator* CreateAllocator() = 0;
+
+  // Create a SubAllocator. If NumaEnabled() is true, then returned SubAllocator
+  // will allocate memory local to numa_node.  If numa_node == kNUMANoAffinity
+  // then allocated memory is not specific to any NUMA node.
+  virtual SubAllocator* CreateSubAllocator(int numa_node) = 0;
+};
+
+// A singleton registry of AllocatorFactories.
+//
+// Allocators should be obtained through ProcessState or cpu_allocator()
+// (deprecated), not directly through this interface.  The purpose of this
+// registry is to allow link-time discovery of multiple AllocatorFactories among
+// which ProcessState will obtain the best fit at startup.
+class AllocatorFactoryRegistry {
+ public:
+  AllocatorFactoryRegistry() {}
+  ~AllocatorFactoryRegistry() {}
+
+  void Register(const char* source_file, int source_line, const string& name,
+                int priority, AllocatorFactory* factory);
+
+  // Returns 'best fit' Allocator.  Find the factory with the highest priority
+  // and return an allocator constructed by it.  If multiple factories have
+  // been registered with the same priority, picks one by unspecified criteria.
   Allocator* GetAllocator();
 
-  // Returns the global registry of allocators.
-  static AllocatorRegistry* Global();
+  // Returns 'best fit' SubAllocator.  First look for the highest priority
+  // factory that is NUMA-enabled.  If none is registered, fall back to the
+  // highest priority non-NUMA-enabled factory.  If NUMA-enabled, return a
+  // SubAllocator specific to numa_node, otherwise return a NUMA-insensitive
+  // SubAllocator.
+  SubAllocator* GetSubAllocator(int numa_node);
+
+  // Returns the singleton value.
+  static AllocatorFactoryRegistry* singleton();
 
  private:
-  typedef struct {
+  mutex mu_;
+  bool first_alloc_made_ = false;
+  struct FactoryEntry {
+    const char* source_file;
+    int source_line;
     string name;
     int priority;
-    Allocator* allocator;  // not owned
-  } AllocatorRegistryEntry;
-
-  // Returns the Allocator registered for 'name' and 'priority',
-  // or 'nullptr' if not found.
-  Allocator* GetRegisteredAllocator(const string& name, int priority);
-
-  std::vector<AllocatorRegistryEntry> allocators_;
-  Allocator* m_curr_allocator_;  // not owned
+    std::unique_ptr<AllocatorFactory> factory;
+    std::unique_ptr<Allocator> allocator;
+    // Index 0 corresponds to kNUMANoAffinity, other indices are (numa_node +
+    // 1).
+    std::vector<std::unique_ptr<SubAllocator>> sub_allocators;
+  };
+  std::vector<FactoryEntry> factories_ GUARDED_BY(mu_);
+
+  // Returns any FactoryEntry registered under 'name' and 'priority',
+  // or 'nullptr' if none found.
+  const FactoryEntry* FindEntry(const string& name, int priority) const
+      EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
+  TF_DISALLOW_COPY_AND_ASSIGN(AllocatorFactoryRegistry);
 };
 
-namespace allocator_registration {
-
-class AllocatorRegistration {
+class AllocatorFactoryRegistration {
  public:
-  AllocatorRegistration(const string& name, int priority,
-                        Allocator* allocator) {
-    AllocatorRegistry::Global()->Register(name, priority, allocator);
+  AllocatorFactoryRegistration(const char* file, int line, const string& name,
+                               int priority, AllocatorFactory* factory) {
+    AllocatorFactoryRegistry::singleton()->Register(file, line, name, priority,
+                                                    factory);
   }
 };
 
-}  // namespace allocator_registration
-
-#define REGISTER_MEM_ALLOCATOR(name, priority, allocator) \
-  REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(__COUNTER__, name, priority, allocator)
+#define REGISTER_MEM_ALLOCATOR(name, priority, factory)                     \
+  REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(__COUNTER__, __FILE__, __LINE__, name, \
+                                     priority, factory)
 
-#define REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(ctr, name, priority, allocator) \
-  REGISTER_MEM_ALLOCATOR_UNIQ(ctr, name, priority, allocator)
+#define REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(ctr, file, line, name, priority, \
+                                           factory)                         \
+  REGISTER_MEM_ALLOCATOR_UNIQ(ctr, file, line, name, priority, factory)
 
-#define REGISTER_MEM_ALLOCATOR_UNIQ(ctr, name, priority, allocator) \
-  static allocator_registration::AllocatorRegistration              \
-      register_allocator_##ctr(name, priority, new allocator)
+#define REGISTER_MEM_ALLOCATOR_UNIQ(ctr, file, line, name, priority, factory) \
+  static AllocatorFactoryRegistration allocator_factory_reg_##ctr(            \
+      file, line, name, priority, new factory)
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/framework/bfloat16_test.cc b/tensorflow/core/framework/bfloat16_test.cc
index 206396a25ab784e93daa227bcf79fe608f5df706..0a1b5e1975580984c8f245f0889d0cb00ef4dba6 100644
--- a/tensorflow/core/framework/bfloat16_test.cc
+++ b/tensorflow/core/framework/bfloat16_test.cc
@@ -45,7 +45,8 @@ class Bfloat16Test : public ::testing::Test,
                      public ::testing::WithParamInterface<Bfloat16TestParam> {};
 
 TEST_P(Bfloat16Test, TruncateTest) {
-  bfloat16 truncated(GetParam().input);
+  bfloat16 truncated = bfloat16::truncate_to_bfloat16((GetParam().input));
+
   if (std::isnan(GetParam().input)) {
     EXPECT_TRUE(std::isnan(float(truncated)) || std::isinf(float(truncated)));
     return;
diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index 922d34fac9ecf3989c3dbd21a7383a2b8cf1ff23..b184fd91e1edf9fbee0e2d4bbe2811184759d29e 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -184,9 +184,7 @@ class DeviceBase {
 
   virtual ScopedAllocatorMgr* GetScopedAllocatorMgr() const { return nullptr; }
 
-  const bool has_eigen_cpu_device() const {
-    return !eigen_cpu_devices_.empty();
-  }
+  bool has_eigen_cpu_device() const { return !eigen_cpu_devices_.empty(); }
 
   virtual const Eigen::ThreadPoolDevice* eigen_cpu_device();
 
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index 88d9d65f5ad22332c8876a5566884f7831cf3997..57bcc0f513d56d75d31527463d381c2edffa8af3 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -865,12 +865,15 @@ Status FunctionCallFrame::GetRetvals(std::vector<Tensor>* rets) const {
   return Status::OK();
 }
 
-Status FunctionCallFrame::ConsumeRetvals(std::vector<Tensor>* rets) {
+Status FunctionCallFrame::ConsumeRetvals(std::vector<Tensor>* rets,
+                                         bool allow_dead_tensors) {
   rets->clear();
   rets->reserve(rets_.size());
   for (size_t i = 0; i < rets_.size(); ++i) {
     if (rets_[i].has_val) {
       rets->emplace_back(std::move(rets_[i].val));
+    } else if (allow_dead_tensors) {
+      rets->emplace_back();
     } else {
       return errors::Internal("Retval[", i, "] does not have value");
     }
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 8e607b927c6dc6818fbc6cb772413692b7c4c450..5da9af7db38d4e86d09d1e2a688fb3fd6e10dfab 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -261,7 +261,10 @@ class FunctionCallFrame : public CallFrameInterface {
   // Caller methods.
   Status SetArgs(gtl::ArraySlice<Tensor> args);
   Status GetRetvals(std::vector<Tensor>* rets) const;
-  Status ConsumeRetvals(std::vector<Tensor>* rets);
+
+  // Moves the return values from the frame to rets. If allow_dead_tensors is
+  // false it will fail if any of the retvals do not have a value.
+  Status ConsumeRetvals(std::vector<Tensor>* rets, bool allow_dead_tensors);
 
   size_t num_args() const override { return arg_types_.size(); }
   size_t num_retvals() const override { return ret_types_.size(); }
@@ -510,6 +513,9 @@ class FunctionLibraryRuntime {
     // If true, we create a new IntraProcessRendezvous, else use the existing
     // one.
     bool create_rendezvous = false;
+
+    // If True, allow returning dead tensors.
+    bool allow_dead_tensors = false;
   };
   typedef std::function<void(const Status&)> DoneCallback;
   virtual void Run(const Options& opts, Handle handle,
diff --git a/tensorflow/core/framework/function_testlib.cc b/tensorflow/core/framework/function_testlib.cc
index 2b5a0fe1bb897ed2a43785637e873afcb7b3e45d..a8eecc1a63230822b2a90cdf899374046080b449 100644
--- a/tensorflow/core/framework/function_testlib.cc
+++ b/tensorflow/core/framework/function_testlib.cc
@@ -45,13 +45,12 @@ GraphDef GDef(gtl::ArraySlice<NodeDef> nodes,
 }
 
 // Helper to construct a NodeDef.
-NodeDef NDef(const string& name, const string& op,
-             gtl::ArraySlice<string> inputs,
+NodeDef NDef(StringPiece name, StringPiece op, gtl::ArraySlice<string> inputs,
              gtl::ArraySlice<std::pair<string, FDH::AttrValueWrapper>> attrs,
              const string& device) {
   NodeDef n;
-  n.set_name(name);
-  n.set_op(op);
+  n.set_name(name.ToString());
+  n.set_op(op.ToString());
   for (const auto& in : inputs) n.add_input(in);
   n.set_device(device);
   for (auto na : attrs) n.mutable_attr()->insert({na.first, na.second.proto});
diff --git a/tensorflow/core/framework/function_testlib.h b/tensorflow/core/framework/function_testlib.h
index b67c5cb1ab94f9e203f99b2a5982e282c76f942c..8cf3c6a680bf1cfe23405f8d960a45c8982c56bb 100644
--- a/tensorflow/core/framework/function_testlib.h
+++ b/tensorflow/core/framework/function_testlib.h
@@ -48,7 +48,7 @@ class Attrs {
 
 // Helper to construct a NodeDef.
 NodeDef NDef(
-    const string& name, const string& op, gtl::ArraySlice<string> inputs,
+    StringPiece name, StringPiece op, gtl::ArraySlice<string> inputs,
     gtl::ArraySlice<std::pair<string, FunctionDefHelper::AttrValueWrapper>>
         attrs = {},
     const string& device = "");
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index e8ea904ebd042976a30d63e002479acf86be8a04..0bd79366eb9f1c6ce161a8a277dd755eed90457f 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -86,7 +86,8 @@ string AttrSlice::SummarizeNode() const {
 string SummarizeNode(const Node& node) { return SummarizeNodeDef(node.def()); }
 
 string SummarizeNodeDef(const NodeDef& node_def) {
-  string ret = strings::StrCat(node_def.name(), " = ", node_def.op(), "[");
+  string ret = strings::StrCat(FormatNodeDefForError(node_def), " = ",
+                               node_def.op(), "[");
   strings::StrAppend(&ret, SummarizeAttrsHelper(node_def, node_def.device()));
   strings::StrAppend(&ret, "](");
 
@@ -101,6 +102,14 @@ string SummarizeNodeDef(const NodeDef& node_def) {
   return ret;
 }
 
+string FormatNodeForError(const Node& node) {
+  return FormatNodeDefForError(node.def());
+}
+
+string FormatNodeDefForError(const NodeDef& node_def) {
+  return errors::FormatNodeNameForError(node_def.name());
+}
+
 const AttrValue* AttrSlice::Find(StringPiece attr_name) const {
   // Currently, the collection used for NodeDef::attr() (google::protobuf::Map)
   // requires that the keys used for lookups have type 'const string&'. Because
@@ -634,7 +643,7 @@ Status ValidateExternalNodeDefSyntax(const NodeDef& node_def) {
 Status AttachDef(const Status& status, const NodeDef& node_def) {
   Status ret = status;
   errors::AppendToMessage(
-      &ret, strings::StrCat(" [[Node: ", SummarizeNodeDef(node_def), "]]"));
+      &ret, strings::StrCat(" [[", SummarizeNodeDef(node_def), "]]"));
   return ret;
 }
 
diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index 64c8b386e83c331634dc7ca72a0924ed5899cb06..c012b7c3d37d3c82b91568fa054e8aa479527d27 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -50,6 +50,12 @@ extern const char* const kColocationGroupPrefix;
 string SummarizeNode(const Node& node);
 string SummarizeNodeDef(const NodeDef& node_def);
 
+// Produces a formatted string pattern from the node which can uniquely identify
+// this node upstream to produce an informative error message. The pattern
+// followed is: {{node <node_name>}}
+string FormatNodeForError(const Node& node);
+string FormatNodeDefForError(const NodeDef& node_def);
+
 typedef protobuf::Map<string, AttrValue> AttrValueMap;
 
 // Adds an attr with name <name> and value <value> to *node_def.
diff --git a/tensorflow/core/framework/node_def_util_test.cc b/tensorflow/core/framework/node_def_util_test.cc
index 35b7b2272bef61476a88ccece71d446d882055c7..74cc59486328bea7336c658ec6c0ba7d58e2c190 100644
--- a/tensorflow/core/framework/node_def_util_test.cc
+++ b/tensorflow/core/framework/node_def_util_test.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/op_def_builder.h"
 #include "tensorflow/core/framework/op_def_util.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -79,7 +81,7 @@ TEST(NodeDefUtilTest, In) {
     )proto");
   ExpectSuccess(node_def, op);
 
-  EXPECT_EQ("n = In[T=DT_FLOAT](a)", SummarizeNodeDef(node_def));
+  EXPECT_EQ("{{node n}} = In[T=DT_FLOAT](a)", SummarizeNodeDef(node_def));
 
   // Mismatching Op names.
   NodeDef bad = node_def;
@@ -144,7 +146,7 @@ TEST(NodeDefUtilTest, Out) {
     )proto");
   ExpectSuccess(node_def, op);
 
-  EXPECT_EQ("n = Out[T=DT_INT32]()", SummarizeNodeDef(node_def));
+  EXPECT_EQ("{{node n}} = Out[T=DT_INT32]()", SummarizeNodeDef(node_def));
 
   // Non-number type.
   NodeDef bad = node_def;
@@ -164,7 +166,7 @@ TEST(NodeDefUtilTest, Enum) {
     )proto");
   ExpectSuccess(node_def, op);
 
-  EXPECT_EQ("n = Enum[e=\"apple\"]()", SummarizeNodeDef(node_def));
+  EXPECT_EQ("{{node n}} = Enum[e=\"apple\"]()", SummarizeNodeDef(node_def));
 
   NodeDef good = node_def;
   good.clear_attr();
@@ -191,7 +193,8 @@ TEST(NodeDefUtilTest, SameIn) {
     )proto");
   ExpectSuccess(node_def, op);
 
-  EXPECT_EQ("n = SameIn[N=2, T=DT_DOUBLE](a, b)", SummarizeNodeDef(node_def));
+  EXPECT_EQ("{{node n}} = SameIn[N=2, T=DT_DOUBLE](a, b)",
+            SummarizeNodeDef(node_def));
 
   // Illegal type
   NodeDef bad = ToNodeDef(R"proto(
@@ -220,7 +223,7 @@ TEST(NodeDefUtilTest, AnyIn) {
     )proto");
   ExpectSuccess(node_def, op);
 
-  EXPECT_EQ("n = AnyIn[T=[DT_INT32, DT_STRING]](a, b)",
+  EXPECT_EQ("{{node n}} = AnyIn[T=[DT_INT32, DT_STRING]](a, b)",
             SummarizeNodeDef(node_def));
 
   const NodeDef bad = ToNodeDef(R"proto(
@@ -243,13 +246,14 @@ TEST(NodeDefUtilTest, Device) {
   const NodeDef node_def1 =
       ToNodeDef(NodeDefBuilder("d", &op_def1).Device("/cpu:17"));
   ExpectSuccess(node_def1, op_def1);
-  EXPECT_EQ("d = None[_device=\"/cpu:17\"]()", SummarizeNodeDef(node_def1));
+  EXPECT_EQ("{{node d}} = None[_device=\"/cpu:17\"]()",
+            SummarizeNodeDef(node_def1));
 
   const OpDef op_def2 = ToOpDef(OpDefBuilder("WithAttr").Attr("v: int"));
   const NodeDef node_def2 =
       ToNodeDef(NodeDefBuilder("d", &op_def2).Attr("v", 7).Device("/cpu:5"));
   ExpectSuccess(node_def2, op_def2);
-  EXPECT_EQ("d = WithAttr[v=7, _device=\"/cpu:5\"]()",
+  EXPECT_EQ("{{node d}} = WithAttr[v=7, _device=\"/cpu:5\"]()",
             SummarizeNodeDef(node_def2));
 }
 
@@ -284,7 +288,7 @@ TEST(NodeDefUtilTest, ValidSyntax) {
     )proto");
   ExpectValidSyntax(node_def_explicit_inputs);
 
-  EXPECT_EQ("n = AnyIn[T=[DT_INT32, DT_STRING]](a:0, b:123)",
+  EXPECT_EQ("{{node n}} = AnyIn[T=[DT_INT32, DT_STRING]](a:0, b:123)",
             SummarizeNodeDef(node_def_explicit_inputs));
 
   const NodeDef node_def_partial_shape = ToNodeDef(R"proto(
@@ -379,7 +383,7 @@ TEST(NameRangesForNodeTest, Simple) {
   EXPECT_EQ(NameRangeMap({{"a", {0, 1}}, {"b", {1, 2}}}), inputs);
   EXPECT_EQ(NameRangeMap({{"c", {0, 1}}, {"d", {1, 2}}}), outputs);
 
-  EXPECT_EQ("simple = Simple[](a, b)", SummarizeNodeDef(node_def));
+  EXPECT_EQ("{{node simple}} = Simple[](a, b)", SummarizeNodeDef(node_def));
 
   OpDef bad_op_def = op_def;
   bad_op_def.mutable_input_arg(0)->clear_type();
@@ -399,7 +403,7 @@ TEST(NameRangesForNodeTest, Polymorphic) {
   TF_EXPECT_OK(NameRangesForNode(node_def1, op_def, &inputs, &outputs));
   EXPECT_EQ(NameRangeMap({{"a", {0, 1}}, {"b", {1, 2}}}), inputs);
   EXPECT_EQ(NameRangeMap({{"c", {0, 1}}}), outputs);
-  EXPECT_EQ("poly = Polymorphic[T=DT_INT32](a, b)",
+  EXPECT_EQ("{{node poly}} = Polymorphic[T=DT_INT32](a, b)",
             SummarizeNodeDef(node_def1));
 
   const NodeDef node_def2 = ToNodeDef(NodeDefBuilder("poly", &op_def)
@@ -408,7 +412,8 @@ TEST(NameRangesForNodeTest, Polymorphic) {
   TF_EXPECT_OK(NameRangesForNode(node_def2, op_def, &inputs, &outputs));
   EXPECT_EQ(NameRangeMap({{"a", {0, 1}}, {"b", {1, 2}}}), inputs);
   EXPECT_EQ(NameRangeMap({{"c", {0, 1}}}), outputs);
-  EXPECT_EQ("poly = Polymorphic[T=DT_BOOL](a, b)", SummarizeNodeDef(node_def2));
+  EXPECT_EQ("{{node poly}} = Polymorphic[T=DT_BOOL](a, b)",
+            SummarizeNodeDef(node_def2));
 }
 
 TEST(NameRangesForNodeTest, NRepeats) {
@@ -431,7 +436,8 @@ TEST(NameRangesForNodeTest, NRepeats) {
   EXPECT_EQ(NameRangeMap({{"c", {0, 1}}, {"d", {1, 5}}, {"e", {5, 8}}}),
             outputs);
   EXPECT_EQ(
-      "nr = NRepeats[M=3, N=4, T=DT_FLOAT](a, a:1, a:2, a:3, b, b:1, b:2, b:3)",
+      "{{node nr}} = NRepeats[M=3, N=4, T=DT_FLOAT](a, a:1, a:2, a:3, b, b:1, "
+      "b:2, b:3)",
       SummarizeNodeDef(node_def1));
 
   const NodeDef node_def2 = ToNodeDef(NodeDefBuilder("nr", &op_def)
@@ -442,7 +448,7 @@ TEST(NameRangesForNodeTest, NRepeats) {
   EXPECT_EQ(NameRangeMap({{"a", {0, 2}}, {"b", {2, 4}}}), inputs);
   EXPECT_EQ(NameRangeMap({{"c", {0, 1}}, {"d", {1, 3}}, {"e", {3, 10}}}),
             outputs);
-  EXPECT_EQ("nr = NRepeats[M=7, N=2, T=DT_DOUBLE](a, a:1, b, b:1)",
+  EXPECT_EQ("{{node nr}} = NRepeats[M=7, N=2, T=DT_DOUBLE](a, a:1, b, b:1)",
             SummarizeNodeDef(node_def2));
 
   NodeDef bad_node_def = node_def2;
@@ -471,7 +477,7 @@ TEST(NameRangesForNodeTest, TypeList) {
   EXPECT_EQ(NameRangeMap({{"c", {0, 4}}, {"d", {4, 7}}, {"e", {7, 9}}}),
             outputs);
   EXPECT_EQ(
-      "tl = TypeList[T1=[DT_BOOL, DT_FLOAT],"
+      "{{node tl}} = TypeList[T1=[DT_BOOL, DT_FLOAT],"
       " T2=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT],"
       " T3=[DT_INT32, DT_DOUBLE, DT_STRING]](a, a:1, b, b:1, b:2, b:3)",
       SummarizeNodeDef(node_def1));
@@ -485,7 +491,8 @@ TEST(NameRangesForNodeTest, TypeList) {
   EXPECT_EQ(NameRangeMap({{"c", {0, 1}}, {"d", {1, 3}}, {"e", {3, 10}}}),
             outputs);
   EXPECT_EQ(
-      "tl = TypeList[T1=[DT_INT32, DT_INT32, DT_INT32, DT_INT32, DT_INT32,"
+      "{{node tl}} = TypeList[T1=[DT_INT32, DT_INT32, DT_INT32, DT_INT32, "
+      "DT_INT32,"
       " DT_INT32, DT_INT32], T2=[DT_DOUBLE], T3=[DT_DOUBLE, DT_STRING]]"
       "(a, a:1, a:2, a:3, a:4, a:5, a:6, b)",
       SummarizeNodeDef(node_def2));
@@ -509,5 +516,20 @@ TEST(AddPrefixAndSuffixToNode, Enter) {
   EXPECT_EQ("prefix/test_frame/suffix", frame_name);
 }
 
+TEST(FormatNodeForErrorTest, Node) {
+  Graph g(OpRegistry::Global());
+  Node* node;
+  TF_CHECK_OK(NodeBuilder("enter", "NoOp").Finalize(&g, &node));
+  EXPECT_EQ("{{node enter}}", FormatNodeForError(*node));
+}
+
+TEST(FormatNodeForErrorTest, NodeDef) {
+  NodeDef node_def;
+  node_def.set_name("enter");
+  node_def.set_op("Enter");
+  AddNodeAttr("frame_name", "test_frame", &node_def);
+  EXPECT_EQ("{{node enter}}", FormatNodeDefForError(node_def));
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/op_compatibility_test.cc b/tensorflow/core/framework/op_compatibility_test.cc
index c782480f1fa859715c46785faa22d01675c3c16e..140f2010857887b7e8cea56bc0f444fd0b315dfb 100644
--- a/tensorflow/core/framework/op_compatibility_test.cc
+++ b/tensorflow/core/framework/op_compatibility_test.cc
@@ -209,8 +209,8 @@ TEST_F(OpCompatibilityTest, Same) {
                    .Finalize(node_def()));
   ExpectSuccess(*RegisteredOpDef());
   EXPECT_EQ(
-      "same = Same[N=3, T=DT_FLOAT, TList=[DT_BOOL, DT_BOOL]](a, b, c, c:1, "
-      "c:2, d, d:1, d:2, e, e:1)",
+      "{{node same}} = Same[N=3, T=DT_FLOAT, TList=[DT_BOOL, DT_BOOL]](a, b, "
+      "c, c:1, c:2, d, d:1, d:2, e, e:1)",
       Result());
 }
 
@@ -224,7 +224,7 @@ TEST_F(OpCompatibilityTest, AddAttr) {
       OpDefBuilder("AddAttr").Output("ndef: string").Finalize(&old_op));
   TF_ASSERT_OK(NodeDefBuilder("add_attr", &old_op.op_def).Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("add_attr = AddAttr[a=42]()", Result());
+  EXPECT_EQ("{{node add_attr}} = AddAttr[a=42]()", Result());
 }
 
 // Should be able to make an attr restriction less strict.
@@ -241,7 +241,7 @@ TEST_F(OpCompatibilityTest, LessStrict) {
                    .Attr("a", "B")
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("less_strict = LessStrict[a=\"B\"]()", Result());
+  EXPECT_EQ("{{node less_strict}} = LessStrict[a=\"B\"]()", Result());
 }
 
 // Should be able to remove an attr restriction.
@@ -259,7 +259,8 @@ TEST_F(OpCompatibilityTest, RemoveRestriction) {
                    .Attr("a", DT_INT32)
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("remove_restriction = RemoveRestriction[a=DT_INT32]()", Result());
+  EXPECT_EQ("{{node remove_restriction}} = RemoveRestriction[a=DT_INT32]()",
+            Result());
 }
 
 // Should be able to change the order of attrs.
@@ -278,7 +279,7 @@ TEST_F(OpCompatibilityTest, AttrOrder) {
                    .Attr("a", 7)
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("attr_order = AttrOrder[a=7, b=true]()", Result());
+  EXPECT_EQ("{{node attr_order}} = AttrOrder[a=7, b=true]()", Result());
 }
 
 // Should be able to make an input/output polymorphic.
@@ -299,7 +300,8 @@ TEST_F(OpCompatibilityTest, TypePolymorphic) {
                    .Input(FakeInput())
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("type_polymorphic = TypePolymorphic[T=DT_INT32](a)", Result());
+  EXPECT_EQ("{{node type_polymorphic}} = TypePolymorphic[T=DT_INT32](a)",
+            Result());
 }
 
 // Should be able to make a single input/output into a list.
@@ -320,7 +322,7 @@ TEST_F(OpCompatibilityTest, MakeList) {
                    .Input(FakeInput())
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("make_list = MakeList[N=1](a)", Result());
+  EXPECT_EQ("{{node make_list}} = MakeList[N=1](a)", Result());
 }
 
 // Should be able to make a single input/output into a polymorphic list.
@@ -343,7 +345,8 @@ TEST_F(OpCompatibilityTest, MakePolyList) {
                    .Input(FakeInput())
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("make_poly_list = MakePolyList[N=1, T=DT_INT32](a)", Result());
+  EXPECT_EQ("{{node make_poly_list}} = MakePolyList[N=1, T=DT_INT32](a)",
+            Result());
 }
 
 // Should be able to make a single input/output into an arbitrary list.
@@ -364,7 +367,7 @@ TEST_F(OpCompatibilityTest, MakeAnyList) {
                    .Input(FakeInput())
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("make_any_list = MakeAnyList[T=[DT_INT32]](a)", Result());
+  EXPECT_EQ("{{node make_any_list}} = MakeAnyList[T=[DT_INT32]](a)", Result());
 }
 
 // Should be able to make a single polymorphic input/output into a list of
@@ -387,7 +390,8 @@ TEST_F(OpCompatibilityTest, PolyIntoList) {
                    .Input(FakeInput(DT_INT32))
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("poly_into_list = PolyIntoList[N=1, T=DT_INT32](a)", Result());
+  EXPECT_EQ("{{node poly_into_list}} = PolyIntoList[N=1, T=DT_INT32](a)",
+            Result());
 }
 
 // Should be able to make a multiple inputs/outputs into a list with
@@ -413,7 +417,7 @@ TEST_F(OpCompatibilityTest, MakeMultipleSameList) {
                    .Input(FakeInput())
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("make_list = MakeMultipleSameList[N=2](a, b)", Result());
+  EXPECT_EQ("{{node make_list}} = MakeMultipleSameList[N=2](a, b)", Result());
 }
 
 // Changing from int32, float -> T
@@ -437,8 +441,9 @@ TEST_F(OpCompatibilityTest, MakeMultipleAnyList) {
                    .Input(FakeInput())
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("make_list = MakeMultipleAnyList[T=[DT_INT32, DT_FLOAT]](a, b)",
-            Result());
+  EXPECT_EQ(
+      "{{node make_list}} = MakeMultipleAnyList[T=[DT_INT32, DT_FLOAT]](a, b)",
+      Result());
 }
 
 // Should be able to change the name of an input/output.
@@ -455,7 +460,7 @@ TEST_F(OpCompatibilityTest, ChangeName) {
                    .Input(FakeInput())
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("change_name = ChangeName[](a)", Result());
+  EXPECT_EQ("{{node change_name}} = ChangeName[](a)", Result());
 }
 
 // Should be able to add an input/output of type
@@ -473,7 +478,7 @@ TEST_F(OpCompatibilityTest, AddNInts) {
   TF_ASSERT_OK(
       NodeDefBuilder("add_n_ints", &old_op.op_def).Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("add_n_ints = AddNInts[N=0]()", Result());
+  EXPECT_EQ("{{node add_n_ints}} = AddNInts[N=0]()", Result());
 }
 
 // Should be able to add an input/output of type N * T
@@ -492,7 +497,7 @@ TEST_F(OpCompatibilityTest, AddNSame) {
   TF_ASSERT_OK(
       NodeDefBuilder("add_n_same", &old_op.op_def).Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("add_n_same = AddNSame[N=0, T=DT_BOOL]()", Result());
+  EXPECT_EQ("{{node add_n_same}} = AddNSame[N=0, T=DT_BOOL]()", Result());
 }
 
 // Should be able to add an input/output of type N * T
@@ -517,8 +522,10 @@ TEST_F(OpCompatibilityTest, AddNSameAsExisting) {
                    .Input(FakeInput(DT_STRING))
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("add_n_same_as_existing = AddNSameAsExisting[N=0, T=DT_STRING](a)",
-            Result());
+  EXPECT_EQ(
+      "{{node add_n_same_as_existing}} = AddNSameAsExisting[N=0, "
+      "T=DT_STRING](a)",
+      Result());
 }
 
 // Should be able to add an input/output of type T
@@ -536,7 +543,7 @@ TEST_F(OpCompatibilityTest, AddAnyList) {
   TF_ASSERT_OK(
       NodeDefBuilder("add_any_list", &old_op.op_def).Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("add_any_list = AddAnyList[T=[]]()", Result());
+  EXPECT_EQ("{{node add_any_list}} = AddAnyList[T=[]]()", Result());
 }
 
 // Should be able to allow shorter lists.
@@ -557,8 +564,10 @@ TEST_F(OpCompatibilityTest, ShorterAnyList) {
                    .Input(FakeInput(2, DT_BOOL))
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("shorter_any_list = ShorterAnyList[T=[DT_BOOL, DT_BOOL]](a, a:1)",
-            Result());
+  EXPECT_EQ(
+      "{{node shorter_any_list}} = ShorterAnyList[T=[DT_BOOL, DT_BOOL]](a, "
+      "a:1)",
+      Result());
 }
 
 REGISTER_OP("ShorterSameList")
@@ -578,7 +587,8 @@ TEST_F(OpCompatibilityTest, ShorterSameList) {
                    .Input(FakeInput(2))
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("shorter_same_list = ShorterSameList[N=2](a, a:1)", Result());
+  EXPECT_EQ("{{node shorter_same_list}} = ShorterSameList[N=2](a, a:1)",
+            Result());
 }
 
 // Can remove a restriction to an attr
@@ -597,7 +607,7 @@ TEST_F(OpCompatibilityTest, AttrRemoveRestriction) {
                    .Attr("t", DT_INT32)
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("remove_restriction = AttrRemoveRestriction[t=DT_INT32]()",
+  EXPECT_EQ("{{node remove_restriction}} = AttrRemoveRestriction[t=DT_INT32]()",
             Result());
 }
 
@@ -619,7 +629,8 @@ TEST_F(OpCompatibilityTest, AttrLessRestrictive) {
                    .Attr("t", DT_INT32)
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("less_restrictive = AttrLessRestrictive[t=DT_INT32]()", Result());
+  EXPECT_EQ("{{node less_restrictive}} = AttrLessRestrictive[t=DT_INT32]()",
+            Result());
 }
 
 // Can remove a minimum from an attr.
@@ -637,7 +648,7 @@ TEST_F(OpCompatibilityTest, AttrRemoveMin) {
                    .Attr("n", 4)
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("remove_min = AttrRemoveMin[n=4]()", Result());
+  EXPECT_EQ("{{node remove_min}} = AttrRemoveMin[n=4]()", Result());
 }
 
 // Can lower the minimum on an attr.
@@ -655,7 +666,7 @@ TEST_F(OpCompatibilityTest, AttrLowerMin) {
                    .Attr("n", 4)
                    .Finalize(node_def()));
   ExpectSuccess(old_op.op_def);
-  EXPECT_EQ("lower_min = AttrLowerMin[n=4]()", Result());
+  EXPECT_EQ("{{node lower_min}} = AttrLowerMin[n=4]()", Result());
 }
 
 // Can make a ref input into a non-ref input.
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index 58feec90f099bc3553f42d85c48e7f5f0d2b060d..b53bd8d53df0cccc6bf5a37cbbc00edc4f7dd530 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -1061,40 +1061,51 @@ Status SupportedDeviceTypesForNode(
 }
 
 void LogAllRegisteredKernels() {
-  for (const auto& key_registration : *GlobalKernelRegistryTyped()) {
-    const KernelDef& kernel_def(key_registration.second.def);
+  KernelList kernel_list = GetAllRegisteredKernels();
+  for (const auto& kernel_def : kernel_list.kernel()) {
     LOG(INFO) << "OpKernel ('" << ProtoShortDebugString(kernel_def) << "')";
   }
 }
 
 KernelList GetAllRegisteredKernels() {
+  return GetFilteredRegisteredKernels([](const KernelDef& k) { return true; });
+}
+
+KernelList GetFilteredRegisteredKernels(
+    const std::function<bool(const KernelDef&)>& predicate) {
   const KernelRegistry* const typed_registry = GlobalKernelRegistryTyped();
   KernelList kernel_list;
   kernel_list.mutable_kernel()->Reserve(typed_registry->size());
   for (const auto& p : *typed_registry) {
-    *kernel_list.add_kernel() = p.second.def;
+    const KernelDef& kernel_def = p.second.def;
+    if (predicate(kernel_def)) {
+      *kernel_list.add_kernel() = kernel_def;
+    }
   }
   return kernel_list;
 }
 
+KernelList GetRegisteredKernelsForOp(StringPiece op_name) {
+  auto op_pred = [op_name](const KernelDef& k) { return k.op() == op_name; };
+  return GetFilteredRegisteredKernels(op_pred);
+}
+
 string KernelsRegisteredForOp(StringPiece op_name) {
+  KernelList kernel_list = GetRegisteredKernelsForOp(op_name);
+  if (kernel_list.kernel_size() == 0) return "  <no registered kernels>\n";
   string ret;
-  for (const auto& key_registration : *GlobalKernelRegistryTyped()) {
-    const KernelDef& kernel_def(key_registration.second.def);
-    if (kernel_def.op() == op_name) {
-      strings::StrAppend(&ret, "  device='", kernel_def.device_type(), "'");
-      if (!kernel_def.label().empty()) {
-        strings::StrAppend(&ret, "; label='", kernel_def.label(), "'");
-      }
-      for (int i = 0; i < kernel_def.constraint_size(); ++i) {
-        strings::StrAppend(
-            &ret, "; ", kernel_def.constraint(i).name(), " in ",
-            SummarizeAttrValue(kernel_def.constraint(i).allowed_values()));
-      }
-      strings::StrAppend(&ret, "\n");
+  for (const auto& kernel_def : kernel_list.kernel()) {
+    strings::StrAppend(&ret, "  device='", kernel_def.device_type(), "'");
+    if (!kernel_def.label().empty()) {
+      strings::StrAppend(&ret, "; label='", kernel_def.label(), "'");
     }
+    for (int i = 0; i < kernel_def.constraint_size(); ++i) {
+      strings::StrAppend(
+          &ret, "; ", kernel_def.constraint(i).name(), " in ",
+          SummarizeAttrValue(kernel_def.constraint(i).allowed_values()));
+    }
+    strings::StrAppend(&ret, "\n");
   }
-  if (ret.empty()) return "  <no registered kernels>\n";
   return ret;
 }
 
@@ -1277,4 +1288,10 @@ void OpKernelContext::CtxFailureWithWarning(const char* file, int line,
   SetStatus(s);
 }
 
+void CheckNotInComputeAsync(OpKernelContext* ctx,
+                            const char* correct_macro_name) {
+  CHECK_EQ(nullptr, ctx->op_kernel().AsAsync())
+      << "Use " << correct_macro_name << " in AsyncOpKernel implementations.";
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index d9fe42fcbb6e3a89fe3aa1d04c653731b2baa11c..2b7cc867dadc306414b557cdda0f08884de9875b 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -113,6 +113,7 @@ class OpKernel {
 
   // Returns nullptr iff this op kernel is synchronous.
   virtual AsyncOpKernel* AsAsync() { return nullptr; }
+  virtual const AsyncOpKernel* AsAsync() const { return nullptr; }
 
   // Returns true iff this op kernel is considered "expensive". The
   // runtime may use this flag to optimize graph execution for example
@@ -197,6 +198,7 @@ class AsyncOpKernel : public OpKernel {
   virtual void ComputeAsync(OpKernelContext* context, DoneCallback done) = 0;
 
   AsyncOpKernel* AsAsync() final { return this; }
+  const AsyncOpKernel* AsAsync() const final { return this; }
 
   void Compute(OpKernelContext* context) final;
 
@@ -1304,6 +1306,13 @@ void LogAllRegisteredKernels();
 // Gets a list of all registered kernels.
 KernelList GetAllRegisteredKernels();
 
+// Gets a list of all registered kernels for which predicate returns true
+KernelList GetFilteredRegisteredKernels(
+    const std::function<bool(const KernelDef&)>& predicate);
+
+// Gets a list of all registered kernels for a given op
+KernelList GetRegisteredKernelsForOp(StringPiece op_name);
+
 namespace kernel_factory {
 
 class OpKernelRegistrar {
@@ -1535,21 +1544,36 @@ inline void OpOutputList::set_ref(int i, mutex* mu, Tensor* tensor_for_ref) {
 //   ...
 // }
 
-#define OP_REQUIRES(CTX, EXP, STATUS)                  \
-  do {                                                 \
-    if (!TF_PREDICT_TRUE(EXP)) {                       \
-      (CTX)->CtxFailure(__FILE__, __LINE__, (STATUS)); \
-      return;                                          \
-    }                                                  \
+// Generate a fatal error if OP_REQUIRES or OP_REQUIRES_OK are used in
+// AsyncOpKernel implementations. If these macros are used and the condition
+// does not hold, the `done` callback will never be called and the system will
+// deadlock, so a crash failure is preferable. Since the OP_REQUIRES[_OK] macros
+// are legal to use in AsyncOpKernel constructors, we use overload resolution
+// to distinguish between OpKernelConstruction* and OpKernelContext* context
+// types.
+class XlaOpKernelContext;
+inline void CheckNotInComputeAsync(XlaOpKernelContext*, const char*) {}
+inline void CheckNotInComputeAsync(OpKernelConstruction*, const char*) {}
+void CheckNotInComputeAsync(OpKernelContext* ctx,
+                            const char* correct_macro_name);
+
+#define OP_REQUIRES(CTX, EXP, STATUS)                     \
+  do {                                                    \
+    if (!TF_PREDICT_TRUE(EXP)) {                          \
+      CheckNotInComputeAsync((CTX), "OP_REQUIRES_ASYNC"); \
+      (CTX)->CtxFailure(__FILE__, __LINE__, (STATUS));    \
+      return;                                             \
+    }                                                     \
   } while (0)
 
-#define OP_REQUIRES_OK(CTX, ...)                            \
-  do {                                                      \
-    ::tensorflow::Status _s(__VA_ARGS__);                   \
-    if (!TF_PREDICT_TRUE(_s.ok())) {                        \
-      (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \
-      return;                                               \
-    }                                                       \
+#define OP_REQUIRES_OK(CTX, ...)                             \
+  do {                                                       \
+    ::tensorflow::Status _s(__VA_ARGS__);                    \
+    if (!TF_PREDICT_TRUE(_s.ok())) {                         \
+      CheckNotInComputeAsync((CTX), "OP_REQUIRES_OK_ASYNC"); \
+      (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s);  \
+      return;                                                \
+    }                                                        \
   } while (0)
 
 #define OP_REQUIRES_ASYNC(CTX, EXP, STATUS, CALLBACK)  \
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index b76a3400a827514b1e2875fa5c8ab74398a95fdf..83dda6579b784be538f45d9c95be57d412f49668 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -965,7 +965,8 @@ BENCHMARK(BM_ConcatInputRange);
 BENCHMARK(BM_SelectInputRange);
 
 TEST(RegisteredKernels, CanCallGetAllRegisteredKernels) {
-  auto all_registered_kernels = GetAllRegisteredKernels().kernel();
+  auto kernel_list = GetAllRegisteredKernels();
+  auto all_registered_kernels = kernel_list.kernel();
   auto has_name_test1 = [](const KernelDef& k) { return k.op() == "Test1"; };
 
   // Verify we can find the "Test1" op registered above
@@ -986,5 +987,20 @@ TEST(RegisteredKernels, CanLogAllRegisteredKernels) {
   tensorflow::LogAllRegisteredKernels();
 }
 
+TEST(RegisteredKernels, GetFilteredRegisteredKernels) {
+  auto has_name_test1 = [](const KernelDef& k) { return k.op() == "Test1"; };
+  auto kernel_list = GetFilteredRegisteredKernels(has_name_test1);
+  ASSERT_EQ(kernel_list.kernel_size(), 1);
+  EXPECT_EQ(kernel_list.kernel(0).op(), "Test1");
+  EXPECT_EQ(kernel_list.kernel(0).device_type(), "CPU");
+}
+
+TEST(RegisteredKernels, GetRegisteredKernelsForOp) {
+  auto kernel_list = GetRegisteredKernelsForOp("Test1");
+  ASSERT_EQ(kernel_list.kernel_size(), 1);
+  EXPECT_EQ(kernel_list.kernel(0).op(), "Test1");
+  EXPECT_EQ(kernel_list.kernel(0).device_type(), "CPU");
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index d2f2609d3b80a70d8df7b080941f46781ed1f6b2..1b19ab5da3160d73f5d962bdfb798b6dedef3af1 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -482,6 +482,7 @@ class Tensor {
   friend class VariableOp;            // For access to set_shape
   friend class AutoReloadVariableOp;  // For access to set_shape
   friend class TensorTestHelper;      // For access to set_shape
+  friend class CastOpBase;            // For access to set_dtype;
   friend class OpKernelContext;       // For access to RefCountIsOne().
   friend class ScopedAllocator;       // For access to buf_.
   friend class XlaTensor;             // For access to RefCountIsOne().
diff --git a/tensorflow/core/graph/control_flow.cc b/tensorflow/core/graph/control_flow.cc
index 1778e48ef649346f27fa0419518e5a4803b51888..8e1e56d29bc474dedf7c0b01dbdf8099ebf86c4d 100644
--- a/tensorflow/core/graph/control_flow.cc
+++ b/tensorflow/core/graph/control_flow.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <deque>
 #include <vector>
 
+#include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/core/errors.h"
@@ -54,10 +55,11 @@ Status ValidateControlFlowInfo(const Graph* graph,
       frame.parent = parent;
       frame.name = cf.frame_name;
     } else if (frame.parent != parent) {
-      return errors::InvalidArgument(
+      return errors::Internal(
           "Invalid loop structure: Mismatched parent frames for \"",
           cf.frame_name, "\": \"", parent->name, "\" vs \"", frame.parent->name,
-          "\". This is an internal bug, please file a bug report with "
+          "\". The node giving this error: ", FormatNodeForError(*node),
+          "This is an internal bug, please file a bug report with "
           "instructions on how to reproduce the error.");
     }
     if (IsLoopCond(node)) {
@@ -69,9 +71,9 @@ Status ValidateControlFlowInfo(const Graph* graph,
           !str_util::StrContains(node->name(), "LoopCounter")) {
         return errors::InvalidArgument(
             "Invalid loop structure: Loop \"", cf.frame_name,
-            "\" has more than one LoopCond node: \"", node->name(), "\" and \"",
-            frame.loop_cond->name(),
-            "\". This is an internal bug, please file a bug report with "
+            "\" has more than one LoopCond node: ", FormatNodeForError(*node),
+            " and ", FormatNodeForError(*frame.loop_cond),
+            ". This is an internal bug, please file a bug report with "
             "instructions on how to reproduce the error.");
       }
       frame.loop_cond = node;
@@ -135,12 +137,11 @@ Status BuildControlFlowInfo(const Graph* g, std::vector<ControlFlowInfo>* info,
           const string& parent_frame = (*info)[out_parent->id()].frame_name;
           if (parent_frame != frame_name) {
             return errors::InvalidArgument(
-                "The node '", out->name(),
-                "' has inputs from different "
-                "frames. The input '",
-                curr_node->name(), "' is in frame '", frame_name,
-                "'. The input '", parent_nodes[out->id()]->name(),
-                "' is in frame '", parent_frame, "'.");
+                FormatNodeForError(*out),
+                " has inputs from different frames. The input ",
+                FormatNodeForError(*curr_node), " is in frame '", frame_name,
+                "'. The input ", FormatNodeForError(*parent_nodes[out->id()]),
+                " is in frame '", parent_frame, "'.");
           }
         } else {
           out_info->frame = out;
@@ -148,7 +149,8 @@ Status BuildControlFlowInfo(const Graph* g, std::vector<ControlFlowInfo>* info,
           TF_RETURN_IF_ERROR(
               GetNodeAttr(out->attrs(), "frame_name", &out_info->frame_name));
           if (out_info->frame_name.empty()) {
-            return errors::InvalidArgument("The Enter node ", out->name(),
+            return errors::InvalidArgument("The Enter ",
+                                           FormatNodeForError(*out),
                                            " must have a frame name.");
           }
         }
@@ -156,12 +158,11 @@ Status BuildControlFlowInfo(const Graph* g, std::vector<ControlFlowInfo>* info,
         if (is_visited) {
           if (out_info->frame_name != frame_name) {
             return errors::InvalidArgument(
-                "The node '", out->name(),
-                "' has inputs from different "
-                "frames. The input '",
-                curr_node->name(), "' is in frame '", frame_name,
-                "'. The input '", parent_nodes[out->id()]->name(),
-                "' is in frame '", out_info->frame_name, "'.");
+                FormatNodeForError(*out),
+                " has inputs from different frames. The input ",
+                FormatNodeForError(*curr_node), " is in frame '", frame_name,
+                "'. The input ", FormatNodeForError(*parent_nodes[out->id()]),
+                " is in frame '", out_info->frame_name, "'.");
           }
         } else {
           out_info->frame = frame;
diff --git a/tensorflow/core/graph/control_flow_test.cc b/tensorflow/core/graph/control_flow_test.cc
index eb7937400ff0ea23dc75b1538f3a8bd5373a93b6..803c757c3ffb3e14a334301de88d87a9f54a3b6b 100644
--- a/tensorflow/core/graph/control_flow_test.cc
+++ b/tensorflow/core/graph/control_flow_test.cc
@@ -63,6 +63,15 @@ TEST(ValidateControlFlowTest, InputsFromDifferentFrames) {
   EXPECT_TRUE(str_util::StrContains(status.error_message(),
                                     "has inputs from different frames"))
       << status.error_message();
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "{{node outer/body/inner/Merge}}"))
+      << status.error_message();
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "{{node outer/body/inner/Enter}}"))
+      << status.error_message();
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(), "{{node outer/Switch}}"))
+      << status.error_message();
 }
 
 TEST(ValidateControlFlowTest, MismatchedParentFrames) {
@@ -102,6 +111,8 @@ TEST(ValidateControlFlowTest, MismatchedParentFrames) {
   EXPECT_TRUE(
       str_util::StrContains(status.error_message(), "Mismatched parent frames"))
       << status.error_message();
+  EXPECT_TRUE(str_util::StrContains(status.error_message(), "{{node Enter2}}"))
+      << status.error_message();
 }
 
 TEST(ValidateControlFlowTest, TwoLoopCond) {
@@ -125,6 +136,12 @@ TEST(ValidateControlFlowTest, TwoLoopCond) {
   EXPECT_TRUE(str_util::StrContains(status.error_message(),
                                     "more than one LoopCond node"))
       << status.error_message();
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(), "{{node sub/LoopCond}}"))
+      << status.error_message();
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(), "{{node LoopCond}}"))
+      << status.error_message();
 }
 
 }  // namespace
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index 1b1941f9c19cf64420be0f088c99f00768c15b53..ea0a814ab862b3c0cb50625acd2ad843eaa887b8 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -214,6 +214,14 @@ NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info,
       cast_builder.Attr("_start_time", start_time);
     }
     cast_builder.Attr("DstT", cast_dtype);
+
+    if (cast_dtype == DT_BFLOAT16) {
+      // the below attribute specifies that the cast to bfloat16 should use
+      // truncation. This is needed to retain legacy behavior when we change
+      // the default bfloat16 casts to use rounding instead of truncation
+      cast_builder.Attr("Truncate", true);
+    }
+
     NodeDef* cast = gdef->add_node();
     *status = cast_builder.Finalize(cast);
     if (!status->ok()) return nullptr;
diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h
index 5f51d6083b1ae17d8c4dee2434f4b57de5f18d06..333bf761b0a461957107d155b7615520614bf5fa 100644
--- a/tensorflow/core/graph/mkl_graph_util.h
+++ b/tensorflow/core/graph/mkl_graph_util.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPH_MKL_GRAPH_UTIL_H_
 #ifdef INTEL_MKL
 
-#include <string>
 #include "tensorflow/core/framework/op_kernel.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index b9667998d6143b24fee4bc6d23527bf5135331a1..3e769b53033e61f34d349b1b705b4bb024fb1c4a 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include <memory>
 #include <queue>
 #include <set>
-#include <string>
 #include <unordered_set>
 #include <utility>
 #include <vector>
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index fc474c0dc8217c2efb943b7f84e50377dd2bee55..f2bffa2113570c470185c1c61c4b1f5529967003 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/core/graph/mkl_graph_util.h"
 
 #include <algorithm>
-#include <string>
 #include <vector>
 
 #include "tensorflow/core/framework/op.h"
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc
index e9ced4d2b6b2e7bffa0fbe61f546bef0aa9db974..aa39af637fbdb6180fabeb6a3629f672d9ae2809 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include <memory>
 #include <queue>
 #include <set>
-#include <string>
 #include <utility>
 #include <vector>
 
diff --git a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
index bbdbe78bbd863e2d90cccfa8c1f649a33ac97c9e..ebcb6de551ebdd476b61dfe54553a3870571ca39 100644
--- a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
+++ b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/core/graph/mkl_graph_util.h"
 
 #include <algorithm>
-#include <string>
 #include <vector>
 
 #include "tensorflow/core/framework/op.h"
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index 9dcc6765f5b356438c325f84c4891d70e0089efd..7c6fe56e1f2f743bf74e3968eda01e58742ab008 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -33,6 +33,7 @@ tf_cc_test(
     name = "utils_test",
     srcs = ["utils_test.cc"],
     deps = [
+        ":grappler_item",
         ":utils",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/core:all_kernels",
@@ -151,3 +152,32 @@ tf_cc_test(
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
     ],
 )
+
+cc_library(
+    name = "mutable_graph_view",
+    srcs = [
+        "mutable_graph_view.cc",
+    ],
+    hdrs = ["mutable_graph_view.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_view",
+        ":grappler_item",
+        ":utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_cc_test(
+    name = "mutable_graph_view_test",
+    srcs = ["mutable_graph_view_test.cc"],
+    deps = [
+        ":grappler_item",
+        ":mutable_graph_view",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+    ],
+)
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 83a8326e796438fa2ddb20a246035f9f5ae0fbcd..231c7c63bea1b0fe42bb00b9443a3af380eccf3b 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -496,18 +496,11 @@ class SymbolicShapeRefiner {
             "supported.");
       }
 
+      // It is guaranteed that output_tensors does not contain any control
+      // inputs, so port_id >= 0.
       string out_tensor = out_arg.output_tensors[0];
-      auto out_tensor_pieces = str_util::Split(out_tensor, ",");
-      string node_name = out_tensor_pieces[0];
       int port_id;
-
-      // Check if port_id was included in out_tensor
-      if (out_tensor_pieces.size() <= 1) {
-        port_id = 0;
-      } else if (!strings::safe_strto32(out_tensor_pieces[1], &port_id)) {
-        return errors::FailedPrecondition(
-            "Failed string to integer conversion for ", out_tensor_pieces[1]);
-      }
+      string node_name = ParseNodeName(out_tensor, &port_id);
 
       const NodeDef* retnode = gv.GetNode(node_name);
       if (retnode == nullptr) {
@@ -516,6 +509,11 @@ class SymbolicShapeRefiner {
       }
 
       auto output_properties = gp.GetOutputProperties(retnode->name());
+      if (port_id >= output_properties.size()) {
+        return errors::InvalidArgument(
+            out_tensor, " has invalid position ", port_id,
+            " (output_properties.size() = ", output_properties.size(), ").");
+      }
       auto const& outprop = output_properties[port_id];
       const TensorShapeProto& shape = outprop.shape();
       ShapeHandle out;
diff --git a/tensorflow/core/grappler/costs/graph_properties_test.cc b/tensorflow/core/grappler/costs/graph_properties_test.cc
index 1be19d291a5008483568242347583a4e8aea1e30..5acfb56b05c87761f1b3996e90572ed7cc4b9c13 100644
--- a/tensorflow/core/grappler/costs/graph_properties_test.cc
+++ b/tensorflow/core/grappler/costs/graph_properties_test.cc
@@ -887,6 +887,44 @@ TEST_F(GraphPropertiesTest, LargeFunctionStaticShapeInference) {
   EXPECT_EQ(8, in_prop3.shape().dim(3).size());
 }
 
+TEST_F(GraphPropertiesTest, LargeFunctionWithMultipleOutputs) {
+  // Test graph produced in python using:
+  /*
+    @function.Defun(noinline=True)
+    def MyFunc():
+      @function.Defun(*[tf.float32] * 2)
+      def Cond(n, unused_x):
+        return n > 0
+
+      @function.Defun(*[tf.float32] * 2)
+      def Body(n, x):
+        return n - 1, x + n
+
+      i = tf.constant(10)
+      return functional_ops.While([i, 0.], Cond, Body)
+
+    with tf.Graph().as_default():
+      z = MyFunc()
+  */
+  GrapplerItem item;
+  string filename = io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPath,
+                                 "function_functional_while.pbtxt");
+  TF_CHECK_OK(ReadGraphDefFromFile(filename, &item.graph));
+  GraphProperties properties(item);
+  TF_CHECK_OK(properties.InferStatically(false));
+
+  const auto out_props = properties.GetOutputProperties("MyFunc_AenMyWWx1Us");
+  EXPECT_EQ(2, out_props.size());
+
+  const OpInfo::TensorProperties& out_prop0 = out_props[0];
+  EXPECT_EQ(DT_INT32, out_prop0.dtype());
+  EXPECT_FALSE(out_prop0.shape().unknown_rank());
+
+  const OpInfo::TensorProperties& out_prop1 = out_props[1];
+  EXPECT_EQ(DT_FLOAT, out_prop1.dtype());
+  EXPECT_FALSE(out_prop1.shape().unknown_rank());
+}
+
 TEST_F(GraphPropertiesTest, FunctionWithErrorStaticShapeInference) {
   GrapplerItem item;
   string filename = io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataPath,
diff --git a/tensorflow/core/grappler/costs/graph_properties_testdata/function_functional_while.pbtxt b/tensorflow/core/grappler/costs/graph_properties_testdata/function_functional_while.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c94ee2f22775ee4ffda3397972f59577350636b4
--- /dev/null
+++ b/tensorflow/core/grappler/costs/graph_properties_testdata/function_functional_while.pbtxt
@@ -0,0 +1,239 @@
+node {
+  name: "MyFunc_AenMyWWx1Us"
+  op: "MyFunc_AenMyWWx1Us"
+}
+library {
+  function {
+    signature {
+      name: "MyFunc_AenMyWWx1Us"
+      output_arg {
+        name: "while"
+        type: DT_INT32
+      }
+      output_arg {
+        name: "while_0"
+        type: DT_FLOAT
+      }
+      is_stateful: true
+    }
+    node_def {
+      name: "Const"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT32
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT32
+            tensor_shape {
+            }
+            int_val: 10
+          }
+        }
+      }
+    }
+    node_def {
+      name: "While/input_1"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 0.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "While"
+      op: "While"
+      input: "Const:output:0"
+      input: "While/input_1:output:0"
+      attr {
+        key: "T"
+        value {
+          list {
+            type: DT_INT32
+            type: DT_FLOAT
+          }
+        }
+      }
+      attr {
+        key: "body"
+        value {
+          func {
+            name: "Body_8GOMGeZeK5c"
+          }
+        }
+      }
+      attr {
+        key: "cond"
+        value {
+          func {
+            name: "Cond_Xf5ttAHgUCg"
+          }
+        }
+      }
+    }
+    ret {
+      key: "while"
+      value: "While:output:0"
+    }
+    ret {
+      key: "while_0"
+      value: "While:output:1"
+    }
+    attr {
+      key: "_noinline"
+      value {
+        b: true
+      }
+    }
+  }
+  function {
+    signature {
+      name: "Body_8GOMGeZeK5c"
+      input_arg {
+        name: "n"
+        type: DT_FLOAT
+      }
+      input_arg {
+        name: "x"
+        type: DT_FLOAT
+      }
+      output_arg {
+        name: "sub"
+        type: DT_FLOAT
+      }
+      output_arg {
+        name: "add"
+        type: DT_FLOAT
+      }
+    }
+    node_def {
+      name: "sub/y"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 1.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "sub_0"
+      op: "Sub"
+      input: "n"
+      input: "sub/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    node_def {
+      name: "add_0"
+      op: "Add"
+      input: "x"
+      input: "n"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    ret {
+      key: "add"
+      value: "add_0:z:0"
+    }
+    ret {
+      key: "sub"
+      value: "sub_0:z:0"
+    }
+  }
+  function {
+    signature {
+      name: "Cond_Xf5ttAHgUCg"
+      input_arg {
+        name: "n"
+        type: DT_FLOAT
+      }
+      input_arg {
+        name: "unused_x"
+        type: DT_FLOAT
+      }
+      output_arg {
+        name: "greater"
+        type: DT_BOOL
+      }
+    }
+    node_def {
+      name: "Greater/y"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_FLOAT
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_FLOAT
+            tensor_shape {
+            }
+            float_val: 0.0
+          }
+        }
+      }
+    }
+    node_def {
+      name: "Greater"
+      op: "Greater"
+      input: "n"
+      input: "Greater/y:output:0"
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }
+    }
+    ret {
+      key: "greater"
+      value: "Greater:z:0"
+    }
+  }
+}
+versions {
+  producer: 26
+  min_consumer: 12
+}
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index d34eecd00930c763684b205393d547330775f4a6..5b303f6ccb7beda7e3b6e179ebc3989282fd4afa 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -65,6 +65,7 @@ constexpr char kAvgPool[] = "AvgPool";
 constexpr char kAvgPoolGrad[] = "AvgPoolGrad";
 constexpr char kFusedBatchNorm[] = "FusedBatchNorm";
 constexpr char kFusedBatchNormGrad[] = "FusedBatchNormGrad";
+constexpr char kQuantizedMatMulV2[] = "QuantizedMatMulV2";
 
 static const Costs::Duration kMinComputeTime(1);
 
@@ -226,6 +227,7 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
       {kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
       {kBatchMatMul, wrap(&OpLevelCostEstimator::PredictBatchMatMul)},
+      {kQuantizedMatMulV2, wrap(&OpLevelCostEstimator::PredictMatMul)},
 
       {kNoOp, wrap(&OpLevelCostEstimator::PredictNoOp)},
       {kGuaranteeConst, wrap(&OpLevelCostEstimator::PredictNoOp)},
@@ -268,67 +270,70 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
       EIGEN_COST(scalar_product_op<float>) + EIGEN_COST(scalar_max_op<float>) +
       EIGEN_COST(scalar_min_op<float>) + EIGEN_COST(scalar_round_op<float>);
 
-  elementwise_ops_ = {// Unary ops alphabetically sorted
-                      {"Acos", EIGEN_COST(scalar_acos_op<float>)},
-                      {"Asin", EIGEN_COST(scalar_asin_op<float>)},
-                      {"Atan", EIGEN_COST(scalar_atan_op<float>)},
-                      {"Atan2", EIGEN_COST(scalar_quotient_op<float>) +
-                                    EIGEN_COST(scalar_atan_op<float>)},
-                      {"Ceil", EIGEN_COST(scalar_ceil_op<float>)},
-                      {"Cos", EIGEN_COST(scalar_cos_op<float>)},
-                      {"Dequantize", EIGEN_COST(scalar_product_op<float>)},
-                      {"Erf", 1},
-                      {"Erfc", 1},
-                      {"Exp", EIGEN_COST(scalar_exp_op<float>)},
-                      {"Expm1", EIGEN_COST(scalar_expm1_op<float>)},
-                      {"Floor", EIGEN_COST(scalar_floor_op<float>)},
-                      {"Inv", EIGEN_COST(scalar_inverse_op<float>)},
-                      {"InvGrad", 1},
-                      {"Lgamma", 1},
-                      {"Log", EIGEN_COST(scalar_log_op<float>)},
-                      {"Log1p", EIGEN_COST(scalar_log1p_op<float>)},
-                      {"Neg", EIGEN_COST(scalar_opposite_op<float>)},
-                      {"QuantizeV2", quantize_v2_cost},
-                      {"Reciprocal", EIGEN_COST(scalar_inverse_op<float>)},
-                      {"Rint", 1},
-                      {"Round", EIGEN_COST(scalar_round_op<float>)},
-                      {"Rsqrt", EIGEN_COST(scalar_rsqrt_op<float>)},
-                      {"Sqrt", EIGEN_COST(scalar_sqrt_op<float>)},
-                      {"Square", EIGEN_COST(scalar_square_op<float>)},
-                      {"Tanh", EIGEN_COST(scalar_tanh_op<float>)},
-                      {"Relu", EIGEN_COST(scalar_max_op<float>)},
-                      {"Sigmoid", EIGEN_COST(scalar_sigmoid_op<float>)},
-                      {"Sign", EIGEN_COST(scalar_sign_op<float>)},
-                      {"Sin", EIGEN_COST(scalar_sin_op<float>)},
-                      {"Tan", EIGEN_COST(scalar_tan_op<float>)},
-                      // Binary ops alphabetically sorted
-                      {"Add", EIGEN_COST(scalar_sum_op<float>)},
-                      {"ApproximateEqual", 1},
-                      {"BiasAdd", EIGEN_COST(scalar_sum_op<float>)},
-                      {"Div", EIGEN_COST(scalar_quotient_op<float>)},
-                      {"Equal", 1},
-                      {"FloorDiv", EIGEN_COST(scalar_quotient_op<float>)},
-                      {"FloorMod", EIGEN_COST(scalar_mod_op<float>)},
-                      {"Greater", 1},
-                      {"GreaterEqual", 1},
-                      {"Less", 1},
-                      {"LessEqual", 1},
-                      {"LogicalAnd", EIGEN_COST(scalar_boolean_and_op)},
-                      {"LogicalNot", 1},
-                      {"LogicalOr", EIGEN_COST(scalar_boolean_or_op)},
-                      {"Maximum", EIGEN_COST(scalar_max_op<float>)},
-                      {"Minimum", EIGEN_COST(scalar_min_op<float>)},
-                      {"Mod", EIGEN_COST(scalar_mod_op<float>)},
-                      {"Mul", EIGEN_COST(scalar_product_op<float>)},
-                      {"NotEqual", 1},
-                      {"QuantizedAdd", EIGEN_COST(scalar_sum_op<float>)},
-                      {"QuantizedMul", EIGEN_COST(scalar_product_op<float>)},
-                      {"RealDiv", EIGEN_COST(scalar_quotient_op<float>)},
-                      {"ReluGrad", EIGEN_COST(scalar_max_op<float>)},
-                      {"SquareDifference", 1},
-                      {"Sub", EIGEN_COST(scalar_difference_op<float>)},
-                      {"TruncateDiv", EIGEN_COST(scalar_quotient_op<float>)},
-                      {"TruncateMod", EIGEN_COST(scalar_mod_op<float>)}};
+  elementwise_ops_ = {
+      // Unary ops alphabetically sorted
+      {"Acos", EIGEN_COST(scalar_acos_op<float>)},
+      {"Asin", EIGEN_COST(scalar_asin_op<float>)},
+      {"Atan", EIGEN_COST(scalar_atan_op<float>)},
+      {"Atan2", EIGEN_COST(scalar_quotient_op<float>) +
+                    EIGEN_COST(scalar_atan_op<float>)},
+      {"Ceil", EIGEN_COST(scalar_ceil_op<float>)},
+      {"Cos", EIGEN_COST(scalar_cos_op<float>)},
+      {"Dequantize", EIGEN_COST(scalar_product_op<float>)},
+      {"Erf", 1},
+      {"Erfc", 1},
+      {"Exp", EIGEN_COST(scalar_exp_op<float>)},
+      {"Expm1", EIGEN_COST(scalar_expm1_op<float>)},
+      {"Floor", EIGEN_COST(scalar_floor_op<float>)},
+      {"Inv", EIGEN_COST(scalar_inverse_op<float>)},
+      {"InvGrad", 1},
+      {"Lgamma", 1},
+      {"Log", EIGEN_COST(scalar_log_op<float>)},
+      {"Log1p", EIGEN_COST(scalar_log1p_op<float>)},
+      {"Neg", EIGEN_COST(scalar_opposite_op<float>)},
+      {"QuantizeV2", quantize_v2_cost},
+      {"Reciprocal", EIGEN_COST(scalar_inverse_op<float>)},
+      {"Rint", 1},
+      {"Round", EIGEN_COST(scalar_round_op<float>)},
+      {"Rsqrt", EIGEN_COST(scalar_rsqrt_op<float>)},
+      {"Sqrt", EIGEN_COST(scalar_sqrt_op<float>)},
+      {"Square", EIGEN_COST(scalar_square_op<float>)},
+      {"Tanh", EIGEN_COST(scalar_tanh_op<float>)},
+      {"Relu", EIGEN_COST(scalar_max_op<float>)},
+      {"Sigmoid", EIGEN_COST(scalar_sigmoid_op<float>)},
+      {"QuantizedSigmoid", EIGEN_COST(scalar_sigmoid_op<float>)},
+      {"Sign", EIGEN_COST(scalar_sign_op<float>)},
+      {"Sin", EIGEN_COST(scalar_sin_op<float>)},
+      {"Tan", EIGEN_COST(scalar_tan_op<float>)},
+      // Binary ops alphabetically sorted
+      {"Add", EIGEN_COST(scalar_sum_op<float>)},
+      {"ApproximateEqual", 1},
+      {"BiasAdd", EIGEN_COST(scalar_sum_op<float>)},
+      {"QuantizedBiasAdd", EIGEN_COST(scalar_sum_op<float>)},
+      {"Div", EIGEN_COST(scalar_quotient_op<float>)},
+      {"Equal", 1},
+      {"FloorDiv", EIGEN_COST(scalar_quotient_op<float>)},
+      {"FloorMod", EIGEN_COST(scalar_mod_op<float>)},
+      {"Greater", 1},
+      {"GreaterEqual", 1},
+      {"Less", 1},
+      {"LessEqual", 1},
+      {"LogicalAnd", EIGEN_COST(scalar_boolean_and_op)},
+      {"LogicalNot", 1},
+      {"LogicalOr", EIGEN_COST(scalar_boolean_or_op)},
+      {"Maximum", EIGEN_COST(scalar_max_op<float>)},
+      {"Minimum", EIGEN_COST(scalar_min_op<float>)},
+      {"Mod", EIGEN_COST(scalar_mod_op<float>)},
+      {"Mul", EIGEN_COST(scalar_product_op<float>)},
+      {"NotEqual", 1},
+      {"QuantizedAdd", EIGEN_COST(scalar_sum_op<float>)},
+      {"QuantizedMul", EIGEN_COST(scalar_product_op<float>)},
+      {"RealDiv", EIGEN_COST(scalar_quotient_op<float>)},
+      {"ReluGrad", EIGEN_COST(scalar_max_op<float>)},
+      {"SquareDifference", 1},
+      {"Sub", EIGEN_COST(scalar_difference_op<float>)},
+      {"TruncateDiv", EIGEN_COST(scalar_quotient_op<float>)},
+      {"TruncateMod", EIGEN_COST(scalar_mod_op<float>)}};
 
 #undef EIGEN_COST
 
@@ -675,7 +680,7 @@ int64 OpLevelCostEstimator::CountMatMulOperations(
   }
 
   ops = m_dim * n_dim * k_dim * 2;
-  VLOG(1) << "Operations for Matmul" << ops;
+  VLOG(1) << "Operations for Matmul: " << ops;
 
   if (mat_mul != nullptr) {
     mat_mul->m = m_dim;
@@ -972,8 +977,10 @@ int64 OpLevelCostEstimator::CalculateTensorElementCount(
 
 int64 OpLevelCostEstimator::CalculateTensorSize(
     const OpInfo::TensorProperties& tensor, bool* found_unknown_shapes) const {
-  return CalculateTensorElementCount(tensor, found_unknown_shapes) *
-         DataTypeSize(BaseType(tensor.dtype()));
+  int64 count = CalculateTensorElementCount(tensor, found_unknown_shapes);
+  int size = DataTypeSize(BaseType(tensor.dtype()));
+  VLOG(2) << "Count: " << count << " DataTypeSize: " << size;
+  return count * size;
 }
 
 int64 OpLevelCostEstimator::CalculateInputSize(
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index 7f6827295079c6cd8665378dcd076c2195b1106f..6a1b0aebfa87cd1584e431390959dd6f4d0327b8 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
@@ -858,8 +859,9 @@ Costs VirtualScheduler::Summary() const {
     const auto& memory_cost = op_cost_pair.second.memory_time.count();
     const bool is_op_cost_accurate = !op_cost_pair.second.inaccurate;
     if (cost) {  // Skip printing out zero-cost ops.
-      VLOG(1) << " + " << op << " : " << (is_op_cost_accurate ? "" : "~")
-              << cost << " / " << compute_cost << " / " << memory_cost;
+      VLOG(1) << strings::Printf(" + %30s : %c %10ld / %10ld / %10ld",
+                                 op.c_str(), (is_op_cost_accurate ? ' ' : '~'),
+                                 cost, compute_cost, memory_cost);
     }
   }
 
@@ -934,9 +936,11 @@ Costs VirtualScheduler::Summary() const {
                                : 0.0;
       if (cost || mem_usage_percent > 1.0) {
         // Print out only non-zero cost ops or ops with > 1% memory usage.
-        VLOG(1) << " + " << op << " : " << (is_op_cost_accurate ? "" : "~")
-                << cost << " / " << compute_cost << " / " << memory_cost << " ("
-                << strings::HumanReadableNumBytes(op_mem_usage) << " ["
+        VLOG(1) << strings::Printf(" + %30s : %c %10ld / %10ld / %10ld",
+                                   op.c_str(),
+                                   (is_op_cost_accurate ? ' ' : '~'), cost,
+                                   compute_cost, memory_cost)
+                << " (" << strings::HumanReadableNumBytes(op_mem_usage) << " ["
                 << mem_usage_percent << "%] "
                 << (persisent_ops.count(op) > 0 ? ": persistent op)" : ")");
       }
diff --git a/tensorflow/core/grappler/graph_view.cc b/tensorflow/core/grappler/graph_view.cc
index 3e448216f900f28927b7be69708df43d6e02177b..7998f0a902f12b747658463aac6c7071af1f2718 100644
--- a/tensorflow/core/grappler/graph_view.cc
+++ b/tensorflow/core/grappler/graph_view.cc
@@ -22,28 +22,33 @@ namespace grappler {
 GraphView::GraphView(GraphDef* graph) : graph_(graph) {
   for (int i = 0; i < graph_->node_size(); i++) {
     auto node = graph_->mutable_node(i);
-    auto rslt = nodes_.insert(std::make_pair(node->name(), node));
+    auto result = nodes_.emplace(node->name(), node);
     // Check that the graph doesn't contain multiple nodes with the same name.
-    CHECK(rslt.second) << "Non unique node name detected: " << node->name();
+    CHECK(result.second) << "Non unique node name detected: " << node->name();
   }
+
   for (NodeDef& node : *graph_->mutable_node()) {
-    for (int i = 0; i < node.input_size(); ++i) {
-      OutputPort fanin;
-      string fanin_name = ParseNodeName(node.input(i), &fanin.port_id);
-      fanin.node = nodes_[fanin_name];
+    AddFanouts(&node);
+  }
+}
 
-      InputPort input;
-      input.node = &node;
-      if (fanin.port_id < 0) {
-        input.port_id = -1;
-      } else {
-        input.port_id = i;
-        num_regular_outputs_[fanin.node] =
-            std::max(num_regular_outputs_[fanin.node], fanin.port_id);
-      }
+void GraphView::AddFanouts(NodeDef* node) {
+  for (int i = 0; i < node->input_size(); ++i) {
+    OutputPort fanin;
+    string fanin_name = ParseNodeName(node->input(i), &fanin.port_id);
+    fanin.node = nodes_[fanin_name];
 
-      fanouts_[fanin].insert(input);
+    InputPort input;
+    input.node = node;
+    if (fanin.port_id < 0) {
+      input.port_id = -1;
+    } else {
+      input.port_id = i;
+      num_regular_outputs_[fanin.node] =
+          std::max(num_regular_outputs_[fanin.node], fanin.port_id);
     }
+
+    fanouts_[fanin].insert(input);
   }
 }
 
diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
index 584cb9048b64fde5a6d790fe93748e06d83d3b26..050789d2e25bfdee01a8656c480e1e3fd6a39aca 100644
--- a/tensorflow/core/grappler/graph_view.h
+++ b/tensorflow/core/grappler/graph_view.h
@@ -29,8 +29,11 @@ namespace grappler {
 class GraphView {
  public:
   struct Port {
-    Port() : node(nullptr), port_id(-1) {}
+    Port() = default;
     Port(NodeDef* n, int port) : node(n), port_id(port) {}
+
+    // TODO(prazek): ports should keep the constness of GraphView.  The only way
+    // to modify graph through the view should be using MutableGraphView.
     NodeDef* node = nullptr;
     int port_id = -1;
 
@@ -111,13 +114,22 @@ class GraphView {
   std::unordered_set<Edge, HashEdge> GetFaninEdges(
       const NodeDef& node, bool include_controlling_edges) const;
 
+ protected:
+  // Add fanout to every `node` input.
+  void AddFanouts(NodeDef* node);
+  std::unordered_map<string, NodeDef*>* MutableNodes() { return &nodes_; }
+  GraphDef* MutableGraph() { return graph_; }
+
+  using FanoutsMapType =
+      std::unordered_map<OutputPort, std::unordered_set<InputPort, HashPort>,
+                         HashPort>;
+  FanoutsMapType* MutableFanouts() { return &fanouts_; }
+
  private:
   GraphDef* graph_;
   std::unordered_map<string, NodeDef*> nodes_;
   std::unordered_set<InputPort, HashPort> empty_set_;
-  std::unordered_map<OutputPort, std::unordered_set<InputPort, HashPort>,
-                     HashPort>
-      fanouts_;
+  FanoutsMapType fanouts_;
   std::unordered_map<const NodeDef*, int> num_regular_outputs_;
 };
 
diff --git a/tensorflow/core/grappler/mutable_graph_view.cc b/tensorflow/core/grappler/mutable_graph_view.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6abafe11a20dfc7ddd5d57009694a66817111c08
--- /dev/null
+++ b/tensorflow/core/grappler/mutable_graph_view.cc
@@ -0,0 +1,72 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/core/grappler/utils.h"
+
+namespace tensorflow {
+namespace grappler {
+
+NodeDef* MutableGraphView::AddNode(NodeDef&& node) {
+  auto* node_in_graph = GetGraph()->add_node();
+  *node_in_graph = std::move(node);
+
+  auto result = MutableNodes()->emplace(node_in_graph->name(), node_in_graph);
+  // Check that the graph doesn't contain multiple nodes with the same name.
+  CHECK(result.second) << "Non unique node name detected: "
+                       << node_in_graph->name();
+  AddFanouts(node_in_graph);
+  return node_in_graph;
+}
+
+void MutableGraphView::ReplaceInput(const NodeDef& old_input,
+                                    const NodeDef& new_input,
+                                    const int output_port_id) {
+  GraphView::OutputPort output_port =
+      GetOutputPort(old_input.name(), output_port_id);
+  auto fanout = GetFanout(output_port);
+  for (auto& input_port : fanout) {
+    input_port.node->set_input(input_port.port_id, new_input.name());
+    AddFanouts(input_port.node);
+  }
+}
+
+void MutableGraphView::DeleteNodes(const std::set<string>& nodes_to_delete) {
+  for (const string& node_name_to_delete : nodes_to_delete)
+    RemoveFanouts(MutableNodes()->at(node_name_to_delete));
+  for (const string& node_name_to_delete : nodes_to_delete)
+    MutableNodes()->erase(node_name_to_delete);
+  EraseNodesFromGraph(nodes_to_delete, GetGraph());
+}
+
+void MutableGraphView::RemoveFanouts(NodeDef* node) {
+  for (int i = 0; i < node->input_size(); ++i) {
+    OutputPort fanin;
+    string fanin_name = ParseNodeName(node->input(i), &fanin.port_id);
+    fanin.node = (*MutableNodes())[fanin_name];
+
+    InputPort input;
+    input.node = node;
+    if (fanin.port_id < 0)
+      input.port_id = -1;
+    else
+      input.port_id = i;
+
+    (*MutableFanouts())[fanin].erase(input);
+  }
+}
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/mutable_graph_view.h b/tensorflow/core/grappler/mutable_graph_view.h
new file mode 100644
index 0000000000000000000000000000000000000000..105eb972e89cc5e155d4d060fc01d1a3f7ad7978
--- /dev/null
+++ b/tensorflow/core/grappler/mutable_graph_view.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_MUTABLE_GRAPH_VIEW_H_
+#define TENSORFLOW_CORE_GRAPPLER_MUTABLE_GRAPH_VIEW_H_
+
+#include "tensorflow/core/grappler/graph_view.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// A utility class to simplify the traversal of a GraphDef that, unlike
+// GraphView, supports updating the graph.  Note that you should not modify the
+// graph separately, because the view will get out of sync.
+class MutableGraphView : public GraphView {
+ public:
+  using GraphView::GraphView;
+
+  GraphDef* GetGraph() { return MutableGraph(); }
+  // Adds a new node to graph and updates the view.
+  NodeDef* AddNode(NodeDef&& node);
+
+  // Replaces the input for the output nodes of 'old_input' with a port
+  // `output_port_id` with 'new_input'.
+  //
+  // E.g: We have 2 nodes that use 'bar' node outputs as inputs:
+  // foo(bar:0, bar:1),  foo2(other:0, bar:0)
+  // Calling ReplaceInput(bar, new, 0) changes every occurrence of bar:0 for
+  // new:0.  Result:
+  // foo(new:0, bar:1),  foo2(other:0, new:0)
+  void ReplaceInput(const NodeDef& old_input, const NodeDef& new_input,
+                    int output_port_id = 0);
+
+  // Deletes nodes from the graph.
+  void DeleteNodes(const std::set<string>& nodes_to_delete);
+
+ private:
+  void RemoveFanouts(NodeDef* node);
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_MUTABLE_GRAPH_VIEW_H_
diff --git a/tensorflow/core/grappler/mutable_graph_view_test.cc b/tensorflow/core/grappler/mutable_graph_view_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f09dfb8271f1d113a1b674ad08b8eba1108fea59
--- /dev/null
+++ b/tensorflow/core/grappler/mutable_graph_view_test.cc
@@ -0,0 +1,94 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+TEST(MutableGraphViewTest, AddAndReplaceInput) {
+  // This outputs simple graph like:
+  //        x
+  //       / \
+  // Square   Square_1
+  //   |   \  /    |
+  //   |    \/     |
+  //   |    /\     |
+  //   |   /  \    |
+  //  AddN     AddN_1
+  //      \   /
+  //        y
+  TrivialTestGraphInputYielder fake_input(2, 2, 2, false, {"/CPU:0", "/GPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  GraphDef new_graph = item.graph;
+  MutableGraphView graph(&new_graph);
+
+  GraphView::InputPort input = graph.GetInputPort("AddN", 0);
+  EXPECT_EQ("AddN", input.node->name());
+  EXPECT_EQ(0, input.port_id);
+  GraphView::OutputPort fanin = graph.GetRegularFanin(input);
+  EXPECT_EQ("Square", fanin.node->name());
+  EXPECT_EQ(0, fanin.port_id);
+
+  auto find_child_with_name = [&graph](string output_port_name,
+                                       string input_name) {
+    GraphView::OutputPort output_port =
+        graph.GetOutputPort(output_port_name, 0);
+    auto fanout = graph.GetFanout(output_port);
+    for (auto& input_port : fanout) {
+      if (input_port.node->name() == input_name) return true;
+    }
+    return false;
+  };
+
+  EXPECT_FALSE(find_child_with_name("Square", "new_node"));
+
+  NodeDef new_node = *input.node;
+  new_node.set_name("new_node");
+
+  EXPECT_EQ(graph.GetNode("new_node"), nullptr);
+  NodeDef* node_in_graph = graph.AddNode(std::move(new_node));
+  EXPECT_NE(graph.GetNode("new_node"), nullptr);
+
+  graph.ReplaceInput(*input.node, *node_in_graph);
+  EXPECT_TRUE(find_child_with_name("Square", "new_node"));
+  EXPECT_TRUE(find_child_with_name("new_node", "y"));
+}
+
+TEST(MutableGraphViewTest, DeleteNodes) {
+  // Outputs simple graph as described in first test.
+  TrivialTestGraphInputYielder fake_input(2, 2, 2, false, {"/CPU:0", "/GPU:0"});
+  GrapplerItem item;
+  CHECK(fake_input.NextItem(&item));
+
+  GraphDef new_graph = item.graph;
+  MutableGraphView graph(&new_graph);
+
+  EXPECT_NE(graph.GetNode("AddN"), nullptr);
+  graph.DeleteNodes({"AddN"});
+
+  EXPECT_EQ(graph.GetNode("AddN"), nullptr);
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 97862d1ed04f824f6a57e12919cccd9c6950912f..3ab22116940ac20512e664505a2c7865c9050b6c 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -3007,14 +3007,7 @@ void ArithmeticOptimizer::DedupComputations() {
 
   // Delete duplicates
   if (fetch_nodes_known_ && !duplicates.empty()) {
-    int last = optimized_graph_->node_size() - 1;
-    for (auto it = duplicates.rbegin(); it != duplicates.rend(); ++it) {
-      int index = *it;
-      optimized_graph_->mutable_node()->SwapElements(index, last);
-      last--;
-    }
-    optimized_graph_->mutable_node()->DeleteSubrange(last + 1,
-                                                     duplicates.size());
+    EraseNodesFromGraph(duplicates, optimized_graph_);
     // Rebuild the NodeMap which was invalidated by the node  swapping above.
     node_map_.reset(new NodeMap(optimized_graph_));
   }
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 76c928f99581c87b795b8eca59082b16e78940b4..f016fae3a58a1f673f5affb5dee7759ee070c810 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -1305,17 +1305,12 @@ Status ConstantFolding::FoldGraph(GraphDef* output) {
   }
 
   // Delete the newly created nodes that don't feed anything.
-  int last = output->node_size() - 1;
-  for (int i = output->node_size() - 1; i >= 0; --i) {
-    const NodeDef& node = output->node(i);
-    auto fanout = node_map_->GetOutputs(node.name());
-    if (fanout.empty()) {
-      output->mutable_node()->SwapElements(i, last);
-      last--;
-    }
+  std::vector<int> nodes_to_delete;
+  for (int i = 0; i < output->node_size(); i++) {
+    auto fanout = node_map_->GetOutputs(output->node(i).name());
+    if (fanout.empty()) nodes_to_delete.push_back(i);
   }
-  output->mutable_node()->DeleteSubrange(last + 1,
-                                         output->node_size() - last - 1);
+  EraseNodesFromGraph(std::move(nodes_to_delete), output);
 
   for (const auto& node : graph_->node()) {
     // If no fetch nodes is provided, we conservatively
diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index c8946c499c67a55476a78bb784a243694e2885b8..d7ac58c99da5311e5a37299be466f0f8d555d6ff 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -46,7 +46,7 @@ cc_library(
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//tensorflow/core/grappler:graph_view",
+        "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
     ] + tf_protos_all(),
@@ -58,8 +58,14 @@ tf_cc_test(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_utils",
+        "//tensorflow/core:framework",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+        "//tensorflow/core/kernels:cast_op",
     ],
 )
 
@@ -73,7 +79,7 @@ cc_library(
     deps = [
         ":graph_utils",
         "//tensorflow/core:lib",
-        "//tensorflow/core/grappler:graph_view",
+        "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
@@ -97,6 +103,43 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "map_fusion",
+    srcs = ["map_fusion.cc"],
+    hdrs = [
+        "map_fusion.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_utils",
+        "//tensorflow/core/grappler:mutable_graph_view",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/kernels:cast_op",
+        "//tensorflow/core/grappler/utils:topological_sort",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+    ] + tf_protos_all(),
+)
+
+tf_cc_test(
+    name = "map_fusion_test",
+    srcs = ["map_fusion_test.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_utils",
+        ":map_fusion",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+        "//tensorflow/core/grappler:grappler_item",
+    ],
+)
+
 cc_library(
     name = "noop_elimination",
     srcs = ["noop_elimination.cc"],
@@ -107,7 +150,7 @@ cc_library(
     deps = [
         ":graph_utils",
         "//tensorflow/core:lib",
-        "//tensorflow/core/grappler:graph_view",
+        "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
@@ -141,7 +184,7 @@ cc_library(
     deps = [
         ":graph_utils",
         "//tensorflow/core:lib",
-        "//tensorflow/core/grappler:graph_view",
+        "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
@@ -171,6 +214,7 @@ cc_library(
     deps = [
         ":function_rename",
         ":map_and_batch_fusion",
+        ":map_fusion",
         ":noop_elimination",
         ":shuffle_and_repeat_fusion",
     ],
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
index ea5f4500090855217ef0a9ef559a9e00b1454178..6ce65333695ea1ad8d73652ac69d82153e6a0bae 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 
 #include "tensorflow/core/framework/device_base.h"
-#include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
@@ -26,14 +26,12 @@ namespace {
 
 constexpr char kConstOpName[] = "Const";
 
-int FindNodeWithPredicate(const std::function<bool(const NodeDef&)>& predicate,
-                          const GraphDef& graph) {
-  for (int i = 0; i < graph.node_size(); ++i) {
-    if (predicate(graph.node(i))) {
-      return i;
-    }
-  }
-  return -1;
+template <typename Predicate, typename Collection>
+int GetElementIdxWithPredicate(const Predicate& predicate,
+                               const Collection& collection) {
+  auto it = std::find_if(collection.begin(), collection.end(), predicate);
+  if (it == collection.end()) return -1;
+  return std::distance(collection.begin(), it);
 }
 
 std::vector<int> CreateNameIndex(const GraphDef& graph) {
@@ -62,13 +60,14 @@ std::vector<int> CreateInputIndex(const NodeDef& node) {
   return index;
 }
 
-Status AddScalarConstNodeHelper(
+NodeDef* AddScalarConstNodeHelper(
     DataType dtype, const std::function<void(TensorProto*)>& add_value,
-    GraphDef* graph, NodeDef** result) {
-  NodeDef* node = graph->add_node();
-  node->set_op(kConstOpName);
-  SetUniqueName(kConstOpName, graph, node);
-  (*node->mutable_attr())["dtype"].set_type(dtype);
+    MutableGraphView* graph) {
+  NodeDef node;
+  node.set_op(kConstOpName);
+  SetUniqueGraphNodeName(kConstOpName, graph->GetGraph(), &node);
+
+  (*node.mutable_attr())["dtype"].set_type(dtype);
   std::unique_ptr<tensorflow::TensorProto> tensor =
       tensorflow::MakeUnique<tensorflow::TensorProto>();
   std::unique_ptr<tensorflow::TensorShapeProto> tensor_shape =
@@ -76,75 +75,69 @@ Status AddScalarConstNodeHelper(
   tensor->set_allocated_tensor_shape(tensor_shape.release());
   tensor->set_dtype(dtype);
   add_value(tensor.get());
-  (*node->mutable_attr())["value"].set_allocated_tensor(tensor.release());
-  *result = node;
-  return Status::OK();
+  (*node.mutable_attr())["value"].set_allocated_tensor(tensor.release());
+
+  return graph->AddNode(std::move(node));
 }
 
 }  // namespace
 
-Status AddNode(const string& name, const string& op,
-               const std::vector<string>& inputs,
-               const std::vector<std::pair<string, AttrValue>>& attributes,
-               GraphDef* graph, NodeDef** result) {
-  NodeDef* node = graph->add_node();
+NodeDef* AddNode(const string& name, const string& op,
+                 const std::vector<string>& inputs,
+                 const std::vector<std::pair<string, AttrValue>>& attributes,
+                 MutableGraphView* graph) {
+  NodeDef node;
   if (!name.empty()) {
-    node->set_name(name);
+    node.set_name(name);
   } else {
-    SetUniqueName(op, graph, node);
+    SetUniqueGraphNodeName(op, graph->GetGraph(), &node);
   }
-  node->set_op(op);
+  node.set_op(op);
   for (const string& input : inputs) {
-    node->add_input(input);
+    node.add_input(input);
   }
   for (auto attr : attributes) {
-    (*node->mutable_attr())[attr.first] = attr.second;
+    (*node.mutable_attr())[attr.first] = attr.second;
   }
-  *result = node;
-  return Status::OK();
+  return graph->AddNode(std::move(node));
 }
 
 template <>
-Status AddScalarConstNode(bool v, GraphDef* graph, NodeDef** result) {
+NodeDef* AddScalarConstNode(bool v, MutableGraphView* graph) {
   return AddScalarConstNodeHelper(
-      DT_BOOL, [v](TensorProto* proto) { proto->add_bool_val(v); }, graph,
-      result);
+      DT_BOOL, [v](TensorProto* proto) { proto->add_bool_val(v); }, graph);
 }
 
 template <>
-Status AddScalarConstNode(double v, GraphDef* graph, NodeDef** result) {
+NodeDef* AddScalarConstNode(double v, MutableGraphView* graph) {
   return AddScalarConstNodeHelper(
-      DT_DOUBLE, [v](TensorProto* proto) { proto->add_double_val(v); }, graph,
-      result);
+      DT_DOUBLE, [v](TensorProto* proto) { proto->add_double_val(v); }, graph);
 }
 
 template <>
-Status AddScalarConstNode(float v, GraphDef* graph, NodeDef** result) {
+NodeDef* AddScalarConstNode(float v, MutableGraphView* graph) {
   return AddScalarConstNodeHelper(
-      DT_FLOAT, [v](TensorProto* proto) { proto->add_float_val(v); }, graph,
-      result);
+      DT_FLOAT, [v](TensorProto* proto) { proto->add_float_val(v); }, graph);
 }
 
 template <>
-Status AddScalarConstNode(int v, GraphDef* graph, NodeDef** result) {
+NodeDef* AddScalarConstNode(int v, MutableGraphView* graph) {
   return AddScalarConstNodeHelper(
-      DT_INT32, [v](TensorProto* proto) { proto->add_int_val(v); }, graph,
-      result);
+      DT_INT32, [v](TensorProto* proto) { proto->add_int_val(v); }, graph);
 }
 
 template <>
-Status AddScalarConstNode(int64 v, GraphDef* graph, NodeDef** result) {
+NodeDef* AddScalarConstNode(int64 v, MutableGraphView* graph) {
   return AddScalarConstNodeHelper(
-      DT_INT64, [v](TensorProto* proto) { proto->add_int64_val(v); }, graph,
-      result);
+      DT_INT64, [v](TensorProto* proto) { proto->add_int64_val(v); }, graph);
 }
 
 template <>
-Status AddScalarConstNode(StringPiece v, GraphDef* graph, NodeDef** result) {
+NodeDef* AddScalarConstNode(StringPiece v, MutableGraphView* graph) {
   return AddScalarConstNodeHelper(
       DT_STRING,
       [v](TensorProto* proto) { proto->add_string_val(v.data(), v.size()); },
-      graph, result);
+      graph);
 }
 
 bool Compare(const GraphDef& g1, const GraphDef& g2) {
@@ -177,52 +170,82 @@ bool Compare(const GraphDef& g1, const GraphDef& g2) {
   return true;
 }
 
-bool ContainsNodeWithName(const string& name, const GraphDef& graph) {
-  return FindNodeWithName(name, graph) != -1;
+bool ContainsGraphNodeWithName(const string& name, const GraphDef& graph) {
+  return FindGraphNodeWithName(name, graph) != -1;
 }
 
 bool ContainsNodeWithOp(const string& op, const GraphDef& graph) {
   return FindNodeWithOp(op, graph) != -1;
 }
 
-Status DeleteNodes(const std::set<string>& nodes_to_delete, GraphDef* graph) {
-  int last = graph->node_size() - 1;
-  for (int i = graph->node_size() - 1; i >= 0; --i) {
-    const NodeDef& node = graph->node(i);
-    if (nodes_to_delete.find(node.name()) != nodes_to_delete.end()) {
-      graph->mutable_node()->SwapElements(i, last);
-      last--;
-    }
-  }
-  graph->mutable_node()->DeleteSubrange(last + 1,
-                                        graph->node_size() - last - 1);
-  return Status::OK();
+bool ContainsGraphFunctionWithName(const string& name,
+                                   const FunctionDefLibrary& library) {
+  return FindGraphFunctionWithName(name, library) != -1;
 }
 
-int FindNodeWithName(const string& name, const GraphDef& graph) {
-  return FindNodeWithPredicate(
-      [name](const NodeDef& node) { return node.name() == name; }, graph);
+bool ContainsFunctionNodeWithName(const string& name,
+                                  const FunctionDef& function) {
+  return FindFunctionNodeWithName(name, function) != -1;
+}
+
+int FindGraphNodeWithName(const string& name, const GraphDef& graph) {
+  return GetElementIdxWithPredicate(
+      [&name](const NodeDef& node) { return node.name() == name; },
+      graph.node());
 }
 
 int FindNodeWithOp(const string& op, const GraphDef& graph) {
-  return FindNodeWithPredicate(
-      [op](const NodeDef& node) { return node.op() == op; }, graph);
+  return GetElementIdxWithPredicate(
+      [&op](const NodeDef& node) { return node.op() == op; }, graph.node());
 }
 
-void SetUniqueName(const string& op, GraphDef* graph, NodeDef* node) {
+int FindGraphFunctionWithName(const string& name,
+                              const FunctionDefLibrary& library) {
+  return GetElementIdxWithPredicate(
+      [&name](const FunctionDef& function) {
+        return function.signature().name() == name;
+      },
+      library.function());
+}
+
+int FindFunctionNodeWithName(const string& name, const FunctionDef& function) {
+  return GetElementIdxWithPredicate(
+      [&name](const NodeDef& node) { return node.name() == name; },
+      function.node_def());
+}
+
+void SetUniqueGraphNodeName(const string& prefix, GraphDef* graph,
+                            NodeDef* node) {
+  string name = prefix;
   int id = graph->node_size();
-  while (ContainsNodeWithName(strings::StrCat(op, "/_", id), *graph)) {
+  while (ContainsGraphNodeWithName(name, *graph)) {
+    name = strings::StrCat(prefix, "/_", id);
     ++id;
   }
-  node->set_name(strings::StrCat(op, "/_", id));
+  node->set_name(std::move(name));
 }
 
-void ReplaceInput(const NodeDef& old_input, const NodeDef& new_input,
-                  GraphView* graph) {
-  GraphView::OutputPort output_port = graph->GetOutputPort(old_input.name(), 0);
-  auto fanout = graph->GetFanout(output_port);
-  for (auto& input_port : fanout)
-    input_port.node->set_input(0, new_input.name());
+void SetUniqueFunctionNodeName(const string& prefix, FunctionDef* function,
+                               NodeDef* node) {
+  string name = prefix;
+  int id = function->node_def_size();
+  while (ContainsFunctionNodeWithName(name, *function)) {
+    name = strings::StrCat(prefix, "/_", id);
+    ++id;
+  }
+  node->set_name(std::move(name));
+}
+
+void SetUniqueGraphFunctionName(const string& prefix,
+                                FunctionDefLibrary* library,
+                                FunctionDef* function) {
+  string name = prefix;
+  int id = library->function_size();
+  while (ContainsGraphFunctionWithName(name, *library)) {
+    name = strings::StrCat(prefix, "/_", id);
+    ++id;
+  }
+  function->mutable_signature()->set_name(name);
 }
 
 }  // end namespace graph_utils
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.h b/tensorflow/core/grappler/optimizers/data/graph_utils.h
index 1cb0f0c81d38a9d2187ac0be0528f06b52d2afee..0847748802800eb3b4b68cb30e7d3286cf751a8e 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.h
@@ -17,12 +17,13 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_GRAPH_UTILS_H_
 
 #include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/errors.h"
 
@@ -31,57 +32,83 @@ namespace grappler {
 namespace graph_utils {
 
 // Adds a node to the graph.
-Status AddNode(const string& name, const string& op,
-               const std::vector<string>& inputs,
-               const std::vector<std::pair<string, AttrValue>>& attributes,
-               GraphDef* graph, NodeDef** result);
+NodeDef* AddNode(const string& name, const string& op,
+                 const std::vector<string>& inputs,
+                 const std::vector<std::pair<string, AttrValue>>& attributes,
+                 MutableGraphView* graph);
 
 // Adds a Const node with the given value to the graph.
 template <typename T>
-Status AddScalarConstNode(T v, GraphDef* graph, NodeDef** result) {
-  return errors::Unimplemented("Type %s is not supported.",
-                               DataTypeToEnum<T>::value);
+NodeDef* AddScalarConstNode(T v, MutableGraphView* graph) {
+  // is_same is an idiomatic hack for making it compile if not instantiated.
+  // Replacing with false will result in a compile-time error.
+  static_assert(!std::is_same<T, T>::value,
+                "Invalid specialization of this method for type T.");
+  return {};
 }
+
 template <>
-Status AddScalarConstNode(bool v, GraphDef* graph, NodeDef** result);
+NodeDef* AddScalarConstNode(bool v, MutableGraphView* graph);
 template <>
-Status AddScalarConstNode(double v, GraphDef* graph, NodeDef** result);
+NodeDef* AddScalarConstNode(double v, MutableGraphView* graph);
 template <>
-Status AddScalarConstNode(float v, GraphDef* graph, NodeDef** result);
+NodeDef* AddScalarConstNode(float v, MutableGraphView* graph);
 template <>
-Status AddScalarConstNode(int v, GraphDef* graph, NodeDef** result);
+NodeDef* AddScalarConstNode(int v, MutableGraphView* graph);
 template <>
-Status AddScalarConstNode(int64 v, GraphDef* graph, NodeDef** result);
+NodeDef* AddScalarConstNode(int64 v, MutableGraphView* graph);
 template <>
-Status AddScalarConstNode(StringPiece v, GraphDef* graph, NodeDef** result);
+NodeDef* AddScalarConstNode(StringPiece v, MutableGraphView* graph);
 
 // Checks whether the two graphs are the same.
 bool Compare(const GraphDef& g1, const GraphDef& g2);
 
 // Checks whether the graph contains a node with the given name.
-bool ContainsNodeWithName(const string& name, const GraphDef& graph);
+bool ContainsGraphNodeWithName(const string& name, const GraphDef& graph);
+
+// Checks whether the library contains a function with the given name.
+bool ContainsGraphFunctionWithName(const string& name,
+                                   const FunctionDefLibrary& library);
+
+// Checks whether the function contains a node with the given name.
+bool ContainsFunctionNodeWithName(const string& name,
+                                  const FunctionDef& function);
 
 // Checks whether the graph contains a node with the given op.
 bool ContainsNodeWithOp(const string& op, const GraphDef& graph);
 
-// Deletes nodes from the graph.
-Status DeleteNodes(const std::set<string>& nodes_to_delete, GraphDef* graph);
-
 // Returns the index of the node with the given name or -1 if the node does
 // not exist.
-int FindNodeWithName(const string& name, const GraphDef& graph);
+int FindGraphNodeWithName(const string& name, const GraphDef& graph);
+
+// Returns the index of the function with the given name or -1 if the function
+// does not exist.
+int FindGraphFunctionWithName(const string& name,
+                              const FunctionDefLibrary& library);
+
+// Returns the index of the function node with the given name or -1 if the
+// function node does not exist.
+int FindFunctionNodeWithName(const string& name, const FunctionDef& function);
 
 // Returns the index of a node with the given op or -1 if no such  node
 // exists.
 int FindNodeWithOp(const string& op, const GraphDef& graph);
 
-// Sets the node name using the op name as a prefix while guaranteeing the name
+// Sets the node name using `prefix` as a prefix while guaranteeing the name
 // is unique across the graph.
-void SetUniqueName(const string& op, GraphDef* graph, NodeDef* node);
-
-// Replaces the input for the output nodes of 'old_input' with 'new_input'.
-void ReplaceInput(const NodeDef& old_input, const NodeDef& new_input,
-                  GraphView* graph);
+void SetUniqueGraphNodeName(const string& prefix, GraphDef* graph,
+                            NodeDef* node);
+
+// Sets the function node name using the `prefix` as a prefix while guaranteeing
+// the name is unique across the functions nodes.
+void SetUniqueFunctionNodeName(const string& prefix, FunctionDef* function,
+                               NodeDef* node);
+
+// Sets the node name using the `prefix` name as a prefix while guaranteeing the
+// name is unique across the graph.
+void SetUniqueGraphFunctionName(const string& prefix,
+                                FunctionDefLibrary* library,
+                                FunctionDef* function);
 
 }  // end namespace graph_utils
 }  // end namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
index d723d73b7a738e76a0c7e2e8345d3836dde4e038..59ed79ab8f85fe51feb92027e2a204c08f469b70 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -24,149 +25,191 @@ namespace graph_utils {
 namespace {
 
 TEST(GraphUtilsTest, AddScalarConstNodeBool) {
-  GraphDef graph;
-  NodeDef* bool_node;
-  TF_EXPECT_OK(AddScalarConstNode<bool>(true, &graph, &bool_node));
-  EXPECT_TRUE(ContainsNodeWithName(bool_node->name(), graph));
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+  NodeDef* bool_node = AddScalarConstNode<bool>(true, &graph);
+  EXPECT_TRUE(ContainsGraphNodeWithName(bool_node->name(), *graph.GetGraph()));
   EXPECT_EQ(bool_node->attr().at("value").tensor().bool_val(0), true);
 }
 
 TEST(GraphUtilsTest, AddScalarConstNodeDouble) {
-  GraphDef graph;
-  NodeDef* double_node;
-  TF_EXPECT_OK(AddScalarConstNode<double>(3.14, &graph, &double_node));
-  EXPECT_TRUE(ContainsNodeWithName(double_node->name(), graph));
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+  NodeDef* double_node = AddScalarConstNode<double>(3.14, &graph);
+  EXPECT_TRUE(
+      ContainsGraphNodeWithName(double_node->name(), *graph.GetGraph()));
   EXPECT_FLOAT_EQ(double_node->attr().at("value").tensor().double_val(0), 3.14);
 }
 
 TEST(GraphUtilsTest, AddScalarConstNodeFloat) {
-  GraphDef graph;
-  NodeDef* float_node;
-  TF_EXPECT_OK(AddScalarConstNode<float>(3.14, &graph, &float_node));
-  EXPECT_TRUE(ContainsNodeWithName(float_node->name(), graph));
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+  NodeDef* float_node = AddScalarConstNode<float>(3.14, &graph);
+  EXPECT_TRUE(ContainsGraphNodeWithName(float_node->name(), *graph.GetGraph()));
   EXPECT_FLOAT_EQ(float_node->attr().at("value").tensor().float_val(0), 3.14);
 }
 
 TEST(GraphUtilsTest, AddScalarConstNodeInt) {
-  GraphDef graph;
-  NodeDef* int_node;
-  TF_EXPECT_OK(AddScalarConstNode<int>(42, &graph, &int_node));
-  EXPECT_TRUE(ContainsNodeWithName(int_node->name(), graph));
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+  NodeDef* int_node = AddScalarConstNode<int>(42, &graph);
+  EXPECT_TRUE(ContainsGraphNodeWithName(int_node->name(), *graph.GetGraph()));
   EXPECT_EQ(int_node->attr().at("value").tensor().int_val(0), 42);
 }
 
 TEST(GraphUtilsTest, AddScalarConstNodeInt64) {
-  GraphDef graph;
-  NodeDef* int64_node;
-  TF_EXPECT_OK(AddScalarConstNode<int64>(42, &graph, &int64_node));
-  EXPECT_TRUE(ContainsNodeWithName(int64_node->name(), graph));
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+  NodeDef* int64_node = AddScalarConstNode<int64>(42, &graph);
+  EXPECT_TRUE(ContainsGraphNodeWithName(int64_node->name(), *graph.GetGraph()));
   EXPECT_EQ(int64_node->attr().at("value").tensor().int64_val(0), 42);
 }
 
 TEST(GraphUtilsTest, AddScalarConstNodeString) {
-  GraphDef graph;
-  NodeDef* string_node;
-  TF_EXPECT_OK(AddScalarConstNode<StringPiece>("hello", &graph, &string_node));
-  EXPECT_TRUE(ContainsNodeWithName(string_node->name(), graph));
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+  NodeDef* string_node = AddScalarConstNode<StringPiece>("hello", &graph);
+  EXPECT_TRUE(
+      ContainsGraphNodeWithName(string_node->name(), *graph.GetGraph()));
   EXPECT_EQ(string_node->attr().at("value").tensor().string_val(0), "hello");
 }
 
 TEST(GraphUtilsTest, Compare) {
-  GraphDef graphA;
-  GraphDef graphB;
-  EXPECT_TRUE(Compare(graphA, graphB));
+  GraphDef graph_def_a;
+  MutableGraphView graph_a(&graph_def_a);
+  GraphDef graph_def_b;
+  MutableGraphView graph_b(&graph_def_b);
 
-  NodeDef* nodeA;
-  TF_EXPECT_OK(AddNode("A", "OpA", {}, {}, &graphA, &nodeA));
-  NodeDef* nodeB;
-  TF_EXPECT_OK(AddNode("B", "OpB", {"A"}, {}, &graphA, &nodeB));
-  EXPECT_FALSE(Compare(graphA, graphB));
+  EXPECT_TRUE(Compare(graph_def_a, graph_def_b));
 
-  graphB.mutable_node()->CopyFrom(graphA.node());
-  EXPECT_TRUE(Compare(graphA, graphB));
+  AddNode("A", "OpA", {}, {}, &graph_a);
+  AddNode("B", "OpB", {"A"}, {}, &graph_a);
+  EXPECT_FALSE(Compare(graph_def_a, graph_def_b));
+
+  graph_def_b.mutable_node()->CopyFrom(graph_def_a.node());
+  EXPECT_TRUE(Compare(graph_def_a, graph_def_b));
 }
 
-TEST(GraphUtilsTest, ContainsNodeWithName) {
-  GraphDef graph;
-  EXPECT_TRUE(!ContainsNodeWithName("A", graph));
+TEST(GraphUtilsTest, ContainsGraphNodeWithName) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+  EXPECT_TRUE(!ContainsGraphNodeWithName("A", *graph.GetGraph()));
 
-  NodeDef* node;
-  TF_EXPECT_OK(AddNode("A", "OpA", {}, {}, &graph, &node));
-  EXPECT_TRUE(ContainsNodeWithName("A", graph));
+  AddNode("A", "OpA", {}, {}, &graph);
+  EXPECT_TRUE(ContainsGraphNodeWithName("A", *graph.GetGraph()));
 
-  TF_EXPECT_OK(DeleteNodes({"A"}, &graph));
-  EXPECT_TRUE(!ContainsNodeWithName("A", graph));
+  graph.DeleteNodes({"A"});
+  EXPECT_TRUE(!ContainsGraphNodeWithName("A", *graph.GetGraph()));
 }
 
-TEST(GraphUtilsTest, ContainsNodeWithOp) {
-  GraphDef graph;
-  EXPECT_TRUE(!ContainsNodeWithOp("OpA", graph));
+TEST(GraphUtilsTest, ContainsGraphFunctionWithName) {
+  FunctionDefLibrary library;
+  EXPECT_FALSE(ContainsGraphFunctionWithName("new_function", library));
+  FunctionDef* new_function = library.add_function();
+  SetUniqueGraphFunctionName("new_function", &library, new_function);
 
-  NodeDef* node;
-  TF_EXPECT_OK(AddNode("A", "OpA", {}, {}, &graph, &node));
-  EXPECT_TRUE(ContainsNodeWithOp("OpA", graph));
+  EXPECT_TRUE(
+      ContainsGraphFunctionWithName(new_function->signature().name(), library));
+}
 
-  TF_EXPECT_OK(DeleteNodes({"A"}, &graph));
-  EXPECT_TRUE(!ContainsNodeWithOp("OpA", graph));
+TEST(GraphUtilsTest, ContainsFunctionNodeWithName) {
+  FunctionDef function = test::function::XTimesTwo();
+  EXPECT_FALSE(ContainsFunctionNodeWithName(
+      "weird_name_that_should_not_be_there", function));
+  EXPECT_TRUE(ContainsFunctionNodeWithName("two", function));
 }
 
-TEST(GraphUtilsTest, FindNodeWithName) {
-  GraphDef graph;
-  EXPECT_EQ(FindNodeWithName("A", graph), -1);
+TEST(GraphUtilsTest, ContainsNodeWithOp) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+  EXPECT_TRUE(!ContainsNodeWithOp("OpA", *graph.GetGraph()));
 
-  NodeDef* node;
-  TF_EXPECT_OK(AddNode("A", "OpA", {}, {}, &graph, &node));
-  EXPECT_NE(FindNodeWithName("A", graph), -1);
+  AddNode("A", "OpA", {}, {}, &graph);
+  EXPECT_TRUE(ContainsNodeWithOp("OpA", *graph.GetGraph()));
 
-  TF_EXPECT_OK(DeleteNodes({"A"}, &graph));
-  EXPECT_EQ(FindNodeWithName("A", graph), -1);
+  graph.DeleteNodes({"A"});
+  EXPECT_TRUE(!ContainsNodeWithOp("OpA", *graph.GetGraph()));
 }
 
-TEST(GraphUtilsTest, FindNodeWithOp) {
-  GraphDef graph;
-  EXPECT_EQ(FindNodeWithOp("OpA", graph), -1);
+TEST(GraphUtilsTest, FindGraphNodeWithName) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+  EXPECT_EQ(FindGraphNodeWithName("A", *graph.GetGraph()), -1);
 
-  NodeDef* node;
-  TF_EXPECT_OK(AddNode("A", "OpA", {}, {}, &graph, &node));
-  EXPECT_NE(FindNodeWithOp("OpA", graph), -1);
+  AddNode("A", "OpA", {}, {}, &graph);
+  EXPECT_NE(FindGraphNodeWithName("A", *graph.GetGraph()), -1);
 
-  TF_EXPECT_OK(DeleteNodes({"A"}, &graph));
-  EXPECT_EQ(FindNodeWithOp("OpA", graph), -1);
+  graph.DeleteNodes({"A"});
+  EXPECT_EQ(FindGraphNodeWithName("A", *graph.GetGraph()), -1);
 }
 
-TEST(GraphUtilsTest, SetUniqueName) {
-  GraphDef graph;
+TEST(GraphUtilsTest, FindFunctionWithName) {
+  FunctionDef function = test::function::XTimesTwo();
+  EXPECT_EQ(
+      FindFunctionNodeWithName("weird_name_that_should_not_be_there", function),
+      -1);
+  EXPECT_NE(FindFunctionNodeWithName("two", function), -1);
+}
 
-  NodeDef* node1;
-  TF_EXPECT_OK(AddNode("", "A", {}, {}, &graph, &node1));
-  NodeDef* node2;
-  TF_EXPECT_OK(AddNode("", "A", {}, {}, &graph, &node2));
-  EXPECT_NE(node1->name(), node2->name());
+TEST(GraphUtilsTest, FindGraphFunctionWithName) {
+  FunctionDefLibrary library;
+  EXPECT_EQ(FindGraphFunctionWithName("new_function", library), -1);
+  FunctionDef* new_function = library.add_function();
+  SetUniqueGraphFunctionName("new_function", &library, new_function);
 
-  TF_EXPECT_OK(DeleteNodes({node1->name()}, &graph));
-  NodeDef* node3;
-  TF_EXPECT_OK(AddNode("", "A", {}, {}, &graph, &node3));
-  EXPECT_NE(node2->name(), node3->name());
+  EXPECT_NE(
+      FindGraphFunctionWithName(new_function->signature().name(), library), -1);
 }
 
-TEST(GraphUtilsTest, ReplaceInput) {
-  GraphDef graph;
+TEST(GraphUtilsTest, FindNodeWithOp) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
+  EXPECT_EQ(FindNodeWithOp("OpA", *graph.GetGraph()), -1);
+
+  AddNode("A", "OpA", {}, {}, &graph);
+  EXPECT_NE(FindNodeWithOp("OpA", *graph.GetGraph()), -1);
+
+  graph.DeleteNodes({"A"});
+  EXPECT_EQ(FindNodeWithOp("OpA", *graph.GetGraph()), -1);
+}
 
-  NodeDef* node1;
-  TF_EXPECT_OK(AddNode("", "A", {}, {}, &graph, &node1));
+TEST(GraphUtilsTest, SetUniqueGraphNodeName) {
+  GraphDef graph_def;
+  MutableGraphView graph(&graph_def);
 
-  NodeDef* node2;
-  TF_EXPECT_OK(AddNode("", "A", {node1->name()}, {}, &graph, &node2));
+  NodeDef* node1 = AddNode("", "A", {}, {}, &graph);
+  NodeDef* node2 = AddNode("", "A", {}, {}, &graph);
+  EXPECT_NE(node1->name(), node2->name());
 
-  NodeDef* node3;
-  TF_EXPECT_OK(AddNode("", "A", {node2->name()}, {}, &graph, &node3));
+  graph.DeleteNodes({node1->name()});
+  NodeDef* node3 = AddNode("", "A", {}, {}, &graph);
+  EXPECT_NE(node2->name(), node3->name());
+}
 
-  EXPECT_EQ(node3->input(0), node2->name());
+TEST(GraphUtilsTest, SetUniqueFunctionNodeName) {
+  FunctionDef function = test::function::XTimesTwo();
+  NodeDef node;
+  SetUniqueFunctionNodeName("abc", &function, &node);
+  for (const NodeDef& function_node : function.node_def()) {
+    EXPECT_NE(node.name(), function_node.name());
+  }
+  auto* new_node = function.add_node_def();
+  *new_node = node;
+
+  NodeDef other;
+  SetUniqueFunctionNodeName("abc", &function, &other);
+  EXPECT_NE(other.name(), new_node->name());
+}
 
-  GraphView view(&graph);
-  ReplaceInput(*node2, *node1, &view);
+TEST(GraphUtilsTest, SetUniqueGraphFunctionName) {
+  FunctionDefLibrary library;
+  FunctionDef* new_function = library.add_function();
+  SetUniqueGraphFunctionName("new_function", &library, new_function);
 
-  EXPECT_EQ(node3->input(0), node1->name());
+  FunctionDef* other_function = library.add_function();
+  SetUniqueGraphFunctionName("new_function", &library, other_function);
+  EXPECT_NE(new_function->signature().name(),
+            other_function->signature().name());
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
index eac665bd92fd292827a68fcde3f9bb4c7e6e95cb..3ce238a30ad29eb8258d0b53ca59fadcb6b35742 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
-#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
@@ -32,12 +32,70 @@ namespace {
 
 constexpr char kFusedOpName[] = "MapAndBatchDatasetV2";
 
+NodeDef make_map_and_batch_node(const NodeDef& map_node,
+                                const NodeDef& batch_node,
+                                MutableGraphView* graph) {
+  NodeDef new_node;
+  new_node.set_op(kFusedOpName);
+  graph_utils::SetUniqueGraphNodeName(kFusedOpName, graph->GetGraph(),
+                                      &new_node);
+
+  // Set the `input` input argument.
+  new_node.add_input(map_node.input(0));
+
+  // Set the `other_arguments` input arguments.
+  int num_other_args;
+  if (map_node.op() == "ParallelMapDataset") {
+    num_other_args = map_node.input_size() - 2;
+  } else {
+    num_other_args = map_node.input_size() - 1;
+  }
+  for (int i = 0; i < num_other_args; i++) {
+    new_node.add_input(map_node.input(i + 1));
+  }
+
+  // Set the `batch_size` input argument.
+  new_node.add_input(batch_node.input(1));
+
+  // Set the `num_parallel_calls` input argument.
+  if (map_node.op() == "ParallelMapDataset") {
+    // The type of the `num_parallel_calls` argument in ParallelMapDataset
+    // and MapAndBatchDataset is different (int32 and int64 respectively)
+    // so we cannot reuse the same Const node and thus create a new one.
+    NodeDef* v = graph->GetNode(map_node.input(map_node.input_size() - 1));
+    NodeDef* tmp = graph_utils::AddScalarConstNode<int64>(
+        v->attr().at("value").tensor().int_val(0), graph);
+    new_node.add_input(tmp->name());
+  } else {
+    NodeDef* tmp = graph_utils::AddScalarConstNode<int64>(1, graph);
+    new_node.add_input(tmp->name());
+  }
+
+  // Set the `drop_remainder` input argument.
+  if (batch_node.op() == "BatchDatasetV2") {
+    new_node.add_input(batch_node.input(2));
+  } else {
+    NodeDef* tmp = graph_utils::AddScalarConstNode<bool>(false, graph);
+    new_node.add_input(tmp->name());
+  }
+
+  // Set `f` and `Targuments` attributes.
+  for (auto key : {"f", "Targuments"}) {
+    (*new_node.mutable_attr())[key] = map_node.attr().at(key);
+  }
+  // Set `output_types` and `output_shapes` attributes.
+  for (auto key : {"output_shapes", "output_types"}) {
+    (*new_node.mutable_attr())[key] = batch_node.attr().at(key);
+  }
+  return new_node;
+}
+
 }  // namespace
 
 Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
                                    GraphDef* output) {
   *output = item.graph;
-  GraphView graph(output);
+  MutableGraphView graph(output);
   std::set<string> nodes_to_delete;
   for (const NodeDef& node : item.graph.node()) {
     if (node.op() != "BatchDataset" && node.op() != "BatchDatasetV2") {
@@ -45,79 +103,25 @@ Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     }
 
     // Use a more descriptive variable name now that we know the node type.
-    const NodeDef batch_node(node);
+    const NodeDef& batch_node = node;
     GraphView::InputPort input_port = graph.GetInputPort(batch_node.name(), 0);
     NodeDef* node2 = graph.GetRegularFanin(input_port).node;
     if (node2->op() != "MapDataset" && node2->op() != "ParallelMapDataset") {
       continue;
     }
-
-    NodeDef* new_node = output->add_node();
-    new_node->set_op(kFusedOpName);
-    graph_utils::SetUniqueName(kFusedOpName, output, new_node);
-
     // Use a more descriptive variable name now that we know the node type.
     NodeDef* map_node = node2;
-    // Set the `input` input argument.
-    new_node->add_input(map_node->input(0));
-
-    // Set the `other_arguments` input arguments.
-    int num_other_args;
-    if (map_node->op() == "ParallelMapDataset") {
-      num_other_args = map_node->input_size() - 2;
-    } else {
-      num_other_args = map_node->input_size() - 1;
-    }
-    for (int i = 0; i < num_other_args; i++) {
-      new_node->add_input(map_node->input(i + 1));
-    }
 
-    // Set the `batch_size` input argument.
-    new_node->add_input(batch_node.input(1));
-
-    // Set the `num_parallel_calls` input argument.
-    if (map_node->op() == "ParallelMapDataset") {
-      // The type of the `num_parallel_calls` argument in ParallelMapDataset
-      // and MapAndBatchDataset is different (int32 and int64 respectively)
-      // so we cannot reuse the same Const node and thus create a new one.
-      NodeDef* v = graph.GetNode(map_node->input(map_node->input_size() - 1));
-      NodeDef* tmp;
-      TF_RETURN_IF_ERROR(graph_utils::AddScalarConstNode<int64>(
-          v->attr().at("value").tensor().int_val(0), output, &tmp));
-      new_node->add_input(tmp->name());
-    } else {
-      NodeDef* tmp;
-      TF_RETURN_IF_ERROR(
-          graph_utils::AddScalarConstNode<int64>(1, output, &tmp));
-      new_node->add_input(tmp->name());
-    }
-
-    // Set the `drop_remainder` input argument.
-    if (batch_node.op() == "BatchDatasetV2") {
-      new_node->add_input(batch_node.input(2));
-    } else {
-      NodeDef* tmp;
-      TF_RETURN_IF_ERROR(
-          graph_utils::AddScalarConstNode<bool>(false, output, &tmp));
-      new_node->add_input(tmp->name());
-    }
-
-    // Set `f` and `Targuments` attributes.
-    for (auto key : {"f", "Targuments"}) {
-      (*new_node->mutable_attr())[key] = map_node->attr().at(key);
-    }
-    // Set `output_types` and `output_shapes` attributes.
-    for (auto key : {"output_shapes", "output_types"}) {
-      (*new_node->mutable_attr())[key] = batch_node.attr().at(key);
-    }
+    auto* new_node =
+        graph.AddNode(make_map_and_batch_node(*map_node, batch_node, &graph));
+    graph.ReplaceInput(batch_node, *new_node);
 
     // Mark the `Map` and `Batch` nodes for removal.
     nodes_to_delete.insert(map_node->name());
     nodes_to_delete.insert(batch_node.name());
-
-    graph_utils::ReplaceInput(batch_node, *new_node, &graph);
   }
-  TF_RETURN_IF_ERROR(graph_utils::DeleteNodes(nodes_to_delete, output));
+
+  graph.DeleteNodes(nodes_to_delete);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
index 3c1d8d5359ae8dd229e4e68b8b06667507d07a2f..a46c504ac48f265033a2935386684848377cdd10 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
@@ -27,25 +27,21 @@ namespace {
 
 TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
   GrapplerItem item;
-  GraphDef *graph = &item.graph;
-  NodeDef *start_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
-  NodeDef *stop_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
-  NodeDef *step_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+  MutableGraphView graph(&item.graph);
+
+  NodeDef *start_node = graph_utils::AddScalarConstNode<int64>(0, &graph);
+  NodeDef *stop_node = graph_utils::AddScalarConstNode<int64>(10, &graph);
+  NodeDef *step_node = graph_utils::AddScalarConstNode<int64>(1, &graph);
 
   std::vector<string> range_inputs(3);
   range_inputs[0] = start_node->name();
   range_inputs[1] = stop_node->name();
   range_inputs[2] = step_node->name();
   std::vector<std::pair<string, AttrValue>> range_attrs;
-  NodeDef *range_node;
-  TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
-                                    range_attrs, graph, &range_node));
-  NodeDef *captured_input_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<StringPiece>(
-      "hello", graph, &captured_input_node));
+  NodeDef *range_node = graph_utils::AddNode("", "RangeDataset", range_inputs,
+                                             range_attrs, &graph);
+  NodeDef *captured_input_node =
+      graph_utils::AddScalarConstNode<StringPiece>("hello", &graph);
 
   NodeDef *map_node;
   {
@@ -59,13 +55,11 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
     AttrValue args_attr;
     SetAttrValue("Targuments", &args_attr);
     map_attrs[1] = std::make_pair("Targuments", args_attr);
-    TF_ASSERT_OK(graph_utils::AddNode("", "MapDataset", map_inputs, map_attrs,
-                                      graph, &map_node));
+    map_node =
+        graph_utils::AddNode("", "MapDataset", map_inputs, map_attrs, &graph);
   }
 
-  NodeDef *batch_size_node;
-  TF_ASSERT_OK(
-      graph_utils::AddScalarConstNode<int64>(5, graph, &batch_size_node));
+  NodeDef *batch_size_node = graph_utils::AddScalarConstNode<int64>(5, &graph);
   NodeDef *batch_node;
   {
     std::vector<string> batch_inputs(2);
@@ -78,16 +72,18 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
     AttrValue types_attr;
     SetAttrValue("output_types", &types_attr);
     batch_attrs[1] = std::make_pair("output_types", types_attr);
-    TF_ASSERT_OK(graph_utils::AddNode("", "BatchDataset", batch_inputs,
-                                      batch_attrs, graph, &batch_node));
+    batch_node = graph_utils::AddNode("", "BatchDataset", batch_inputs,
+                                      batch_attrs, &graph);
   }
 
   MapAndBatchFusion optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  EXPECT_FALSE(graph_utils::ContainsNodeWithName(map_node->name(), output));
-  EXPECT_FALSE(graph_utils::ContainsNodeWithName(batch_node->name(), output));
+  EXPECT_FALSE(
+      graph_utils::ContainsGraphNodeWithName(map_node->name(), output));
+  EXPECT_FALSE(
+      graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
   EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
   NodeDef map_and_batch_node =
       output.node(graph_utils::FindNodeWithOp("MapAndBatchDatasetV2", output));
@@ -96,11 +92,11 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
   EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
   EXPECT_EQ(map_and_batch_node.input(2), batch_node->input(1));
   NodeDef num_parallel_calls_node = output.node(
-      graph_utils::FindNodeWithName(map_and_batch_node.input(3), output));
+      graph_utils::FindGraphNodeWithName(map_and_batch_node.input(3), output));
   EXPECT_EQ(num_parallel_calls_node.attr().at("value").tensor().int64_val(0),
             1);
   NodeDef drop_remainder_node = output.node(
-      graph_utils::FindNodeWithName(map_and_batch_node.input(4), output));
+      graph_utils::FindGraphNodeWithName(map_and_batch_node.input(4), output));
   EXPECT_EQ(drop_remainder_node.attr().at("value").tensor().bool_val(0), false);
   EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("f"),
                                  map_node->attr().at("f")));
@@ -114,25 +110,20 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
 
 TEST(MapAndBatchFusionTest, FuseMapAndBatchV2NodesIntoOne) {
   GrapplerItem item;
-  GraphDef *graph = &item.graph;
-  NodeDef *start_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
-  NodeDef *stop_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
-  NodeDef *step_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+  MutableGraphView graph(&item.graph);
+  NodeDef *start_node = graph_utils::AddScalarConstNode<int64>(0, &graph);
+  NodeDef *stop_node = graph_utils::AddScalarConstNode<int64>(10, &graph);
+  NodeDef *step_node = graph_utils::AddScalarConstNode<int64>(1, &graph);
 
   std::vector<string> range_inputs(3);
   range_inputs[0] = start_node->name();
   range_inputs[1] = stop_node->name();
   range_inputs[2] = step_node->name();
   std::vector<std::pair<string, AttrValue>> range_attrs;
-  NodeDef *range_node;
-  TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
-                                    range_attrs, graph, &range_node));
-  NodeDef *captured_input_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<StringPiece>(
-      "hello", graph, &captured_input_node));
+  NodeDef *range_node = graph_utils::AddNode("", "RangeDataset", range_inputs,
+                                             range_attrs, &graph);
+  NodeDef *captured_input_node =
+      graph_utils::AddScalarConstNode<StringPiece>("hello", &graph);
 
   NodeDef *map_node;
   {
@@ -146,16 +137,13 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchV2NodesIntoOne) {
     AttrValue args_attr;
     SetAttrValue("Targuments", &args_attr);
     map_attrs[1] = std::make_pair("Targuments", args_attr);
-    TF_ASSERT_OK(graph_utils::AddNode("", "MapDataset", map_inputs, map_attrs,
-                                      graph, &map_node));
+    map_node =
+        graph_utils::AddNode("", "MapDataset", map_inputs, map_attrs, &graph);
   }
 
-  NodeDef *batch_size_node;
-  TF_ASSERT_OK(
-      graph_utils::AddScalarConstNode<int64>(5, graph, &batch_size_node));
-  NodeDef *drop_remainder_node;
-  TF_ASSERT_OK(
-      graph_utils::AddScalarConstNode<bool>(true, graph, &drop_remainder_node));
+  NodeDef *batch_size_node = graph_utils::AddScalarConstNode<int64>(5, &graph);
+  NodeDef *drop_remainder_node =
+      graph_utils::AddScalarConstNode<bool>(true, &graph);
   NodeDef *batch_node;
   {
     std::vector<string> batch_inputs(3);
@@ -169,16 +157,18 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchV2NodesIntoOne) {
     AttrValue types_attr;
     SetAttrValue("output_types", &types_attr);
     batch_attrs[1] = std::make_pair("output_types", types_attr);
-    TF_ASSERT_OK(graph_utils::AddNode("", "BatchDatasetV2", batch_inputs,
-                                      batch_attrs, graph, &batch_node));
+    batch_node = graph_utils::AddNode("", "BatchDatasetV2", batch_inputs,
+                                      batch_attrs, &graph);
   }
 
   MapAndBatchFusion optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  EXPECT_FALSE(graph_utils::ContainsNodeWithName(map_node->name(), output));
-  EXPECT_FALSE(graph_utils::ContainsNodeWithName(batch_node->name(), output));
+  EXPECT_FALSE(
+      graph_utils::ContainsGraphNodeWithName(map_node->name(), output));
+  EXPECT_FALSE(
+      graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
   EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
   NodeDef map_and_batch_node =
       output.node(graph_utils::FindNodeWithOp("MapAndBatchDatasetV2", output));
@@ -187,7 +177,7 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchV2NodesIntoOne) {
   EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
   EXPECT_EQ(map_and_batch_node.input(2), batch_node->input(1));
   NodeDef num_parallel_calls_node = output.node(
-      graph_utils::FindNodeWithName(map_and_batch_node.input(3), output));
+      graph_utils::FindGraphNodeWithName(map_and_batch_node.input(3), output));
   EXPECT_EQ(num_parallel_calls_node.attr().at("value").tensor().int64_val(0),
             1);
   EXPECT_EQ(map_and_batch_node.input(4), batch_node->input(2));
@@ -203,28 +193,22 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchV2NodesIntoOne) {
 
 TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
   GrapplerItem item;
-  GraphDef *graph = &item.graph;
-  NodeDef *start_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
-  NodeDef *stop_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
-  NodeDef *step_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+  MutableGraphView graph(&item.graph);
+  NodeDef *start_node = graph_utils::AddScalarConstNode<int64>(0, &graph);
+  NodeDef *stop_node = graph_utils::AddScalarConstNode<int64>(10, &graph);
+  NodeDef *step_node = graph_utils::AddScalarConstNode<int64>(1, &graph);
 
   std::vector<string> range_inputs(3);
   range_inputs[0] = start_node->name();
   range_inputs[1] = stop_node->name();
   range_inputs[2] = step_node->name();
   std::vector<std::pair<string, AttrValue>> range_attrs;
-  NodeDef *range_node;
-  TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
-                                    range_attrs, graph, &range_node));
-  NodeDef *captured_input_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<StringPiece>(
-      "hello", graph, &captured_input_node));
-  NodeDef *num_parallel_calls_node;
-  TF_ASSERT_OK(
-      graph_utils::AddScalarConstNode<int>(2, graph, &num_parallel_calls_node));
+  NodeDef *range_node = graph_utils::AddNode("", "RangeDataset", range_inputs,
+                                             range_attrs, &graph);
+  NodeDef *captured_input_node =
+      graph_utils::AddScalarConstNode<StringPiece>("hello", &graph);
+  NodeDef *num_parallel_calls_node =
+      graph_utils::AddScalarConstNode<int>(2, &graph);
 
   NodeDef *map_node;
   {
@@ -239,13 +223,11 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
     AttrValue args_attr;
     SetAttrValue("Targuments", &args_attr);
     map_attrs[1] = std::make_pair("Targuments", args_attr);
-    TF_ASSERT_OK(graph_utils::AddNode("", "ParallelMapDataset", map_inputs,
-                                      map_attrs, graph, &map_node));
+    map_node = graph_utils::AddNode("", "ParallelMapDataset", map_inputs,
+                                    map_attrs, &graph);
   }
 
-  NodeDef *batch_size_node;
-  TF_ASSERT_OK(
-      graph_utils::AddScalarConstNode<int64>(5, graph, &batch_size_node));
+  NodeDef *batch_size_node = graph_utils::AddScalarConstNode<int64>(5, &graph);
   NodeDef *batch_node;
   {
     std::vector<string> batch_inputs(2);
@@ -258,16 +240,18 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
     AttrValue types_attr;
     SetAttrValue("output_types", &types_attr);
     batch_attrs[1] = std::make_pair("output_types", types_attr);
-    TF_ASSERT_OK(graph_utils::AddNode("", "BatchDataset", batch_inputs,
-                                      batch_attrs, graph, &batch_node));
+    batch_node = graph_utils::AddNode("", "BatchDataset", batch_inputs,
+                                      batch_attrs, &graph);
   }
 
   MapAndBatchFusion optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  EXPECT_FALSE(graph_utils::ContainsNodeWithName(map_node->name(), output));
-  EXPECT_FALSE(graph_utils::ContainsNodeWithName(batch_node->name(), output));
+  EXPECT_FALSE(
+      graph_utils::ContainsGraphNodeWithName(map_node->name(), output));
+  EXPECT_FALSE(
+      graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
   EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
   NodeDef map_and_batch_node =
       output.node(graph_utils::FindNodeWithOp("MapAndBatchDatasetV2", output));
@@ -276,11 +260,11 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
   EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
   EXPECT_EQ(map_and_batch_node.input(2), batch_node->input(1));
   NodeDef num_parallel_calls_node2 = output.node(
-      graph_utils::FindNodeWithName(map_and_batch_node.input(3), output));
+      graph_utils::FindGraphNodeWithName(map_and_batch_node.input(3), output));
   EXPECT_EQ(num_parallel_calls_node2.attr().at("value").tensor().int64_val(0),
             2);
   NodeDef drop_remainder_node = output.node(
-      graph_utils::FindNodeWithName(map_and_batch_node.input(4), output));
+      graph_utils::FindGraphNodeWithName(map_and_batch_node.input(4), output));
   EXPECT_EQ(drop_remainder_node.attr().at("value").tensor().bool_val(0), false);
   EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("f"),
                                  map_node->attr().at("f")));
@@ -294,27 +278,21 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
 
 TEST(MapAndBatchFusionTest, NoChange) {
   GrapplerItem item;
-  GraphDef *graph = &item.graph;
+  MutableGraphView graph(&item.graph);
 
-  NodeDef *start_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
-  NodeDef *stop_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
-  NodeDef *step_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+  NodeDef *start_node = graph_utils::AddScalarConstNode<int64>(0, &graph);
+  NodeDef *stop_node = graph_utils::AddScalarConstNode<int64>(10, &graph);
+  NodeDef *step_node = graph_utils::AddScalarConstNode<int64>(1, &graph);
 
   std::vector<string> range_inputs(3);
   range_inputs[0] = start_node->name();
   range_inputs[1] = stop_node->name();
   range_inputs[2] = step_node->name();
   std::vector<std::pair<string, AttrValue>> range_attrs;
-  NodeDef *range_node;
-  TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
-                                    range_attrs, graph, &range_node));
+  NodeDef *range_node = graph_utils::AddNode("", "RangeDataset", range_inputs,
+                                             range_attrs, &graph);
 
-  NodeDef *batch_size_node;
-  TF_ASSERT_OK(
-      graph_utils::AddScalarConstNode<int64>(5, graph, &batch_size_node));
+  NodeDef *batch_size_node = graph_utils::AddScalarConstNode<int64>(5, &graph);
   std::vector<string> batch_inputs(2);
   batch_inputs[0] = range_node->name();
   batch_inputs[1] = batch_size_node->name();
@@ -325,15 +303,13 @@ TEST(MapAndBatchFusionTest, NoChange) {
   AttrValue types_attr;
   SetAttrValue("output_types", &types_attr);
   batch_attrs[1] = std::make_pair("output_types", types_attr);
-  NodeDef *batch_node;
-  TF_ASSERT_OK(graph_utils::AddNode("", "BatchDataset", batch_inputs,
-                                    batch_attrs, graph, &batch_node));
+  graph_utils::AddNode("", "BatchDataset", batch_inputs, batch_attrs, &graph);
 
   MapAndBatchFusion optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  EXPECT_TRUE(graph_utils::Compare(*graph, output));
+  EXPECT_TRUE(graph_utils::Compare(*graph.GetGraph(), output));
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
new file mode 100644
index 0000000000000000000000000000000000000000..707f4a3407bb1ba155b7fd787049910699094381
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
@@ -0,0 +1,261 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/map_fusion.h"
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+// Sets basic function parameters and copies attributes from parent and map
+// node.
+NodeDef MakeFusedNode(const NodeDef& parent_map_node, const NodeDef& map_node,
+                      const FunctionDef& fused_function,
+                      MutableGraphView* graph) {
+  NodeDef fused_node;
+  graph_utils::SetUniqueGraphNodeName("fused_map", graph->GetGraph(),
+                                      &fused_node);
+
+  fused_node.set_op("MapDataset");
+  fused_node.add_input(parent_map_node.input(0));
+
+  auto copy_attribute = [](const string& attribute_name, const NodeDef& from,
+                           NodeDef* to) {
+    (*to->mutable_attr())[attribute_name] = from.attr().at(attribute_name);
+  };
+
+  auto attr = parent_map_node.attr().at("f");
+  *attr.mutable_func()->mutable_name() = fused_function.signature().name();
+  (*fused_node.mutable_attr())["f"] = std::move(attr);
+
+  copy_attribute("Targuments", parent_map_node, &fused_node);
+
+  for (auto key : {"output_shapes", "output_types"})
+    copy_attribute(key, map_node, &fused_node);
+
+  return fused_node;
+}
+
+string ParseNodeConnection(const string& name) {
+  // If input/output node name has semicolon, take the prefix.  Otherwise take
+  // the whole string.
+  return name.substr(0, name.find(':'));
+}
+
+string ParseOutputNode(const string& name) {
+  return name.substr(name.find(':'), string::npos);
+}
+
+const string& GetOutputNode(const FunctionDef& parent_function,
+                            int output_idx) {
+  const auto& ret_output_name =
+      parent_function.signature().output_arg(output_idx).name();
+  return parent_function.ret().at(ret_output_name);
+}
+
+// Nodes that will be added to the function can have the same name as the nodes
+// from parent function.  We need to rename them and the connections of the
+// inputs that refer to them.
+void RenameFunctionNodes(FunctionDef* fused_function,
+                         protobuf::RepeatedPtrField<NodeDef>* nodes_to_fuse) {
+  std::unordered_map<string, string> changed_node_names;
+  for (NodeDef& function_node : *nodes_to_fuse) {
+    string name_before = function_node.name();
+    graph_utils::SetUniqueFunctionNodeName(name_before, fused_function,
+                                           &function_node);
+    if (name_before != function_node.name())
+      changed_node_names[name_before] = function_node.name();
+  }
+
+  auto update_name = [&changed_node_names](string* input) {
+    string input_node = ParseNodeConnection(*input);
+    if (changed_node_names.count(input_node) == 0) return;
+    const string& new_node_name = changed_node_names.at(input_node);
+    *input = new_node_name + ParseOutputNode(*input);
+  };
+
+  for (NodeDef& function_node : *nodes_to_fuse) {
+    for (string& input : *function_node.mutable_input()) {
+      update_name(&input);
+    }
+  }
+
+  for (auto& ret : *fused_function->mutable_ret()) update_name(&ret.second);
+}
+
+// This function adds new nodes and changes their input to the output nodes
+// of parent function.
+void FuseFunctionNodes(const FunctionDef& parent_function,
+                       const FunctionDef& function,
+                       protobuf::RepeatedPtrField<NodeDef>* nodes_to_fuse) {
+  const auto number_of_outputs = parent_function.signature().output_arg_size();
+  CHECK(number_of_outputs == function.signature().input_arg_size())
+      << "The number of input arguments of function "
+      << function.signature().name()
+      << " should be the same as the number of output arguments of function "
+      << parent_function.signature().name() << ".";
+
+  for (int output_idx = 0; output_idx < number_of_outputs; output_idx++) {
+    const string& output = GetOutputNode(parent_function, output_idx);
+
+    const auto& input_node_name =
+        function.signature().input_arg(output_idx).name();
+
+    for (NodeDef& function_node : *nodes_to_fuse) {
+      for (auto& node_input : *function_node.mutable_input()) {
+        auto parsed_name = ParseNodeConnection(node_input);
+        if (parsed_name != input_node_name) continue;
+
+        node_input = output;
+      }
+    }
+  }
+}
+
+// This function looks for direct edges from input to return and rewrites
+// them to the coresponding input of the return of parent_function.
+void FuseReturns(const FunctionDef& parent_function,
+                 const FunctionDef& function, FunctionDef* fused_function) {
+  const auto number_of_inputs = function.signature().input_arg_size();
+
+  for (auto& ret : *fused_function->mutable_ret()) {
+    auto return_input = ParseNodeConnection(ret.second);
+    for (int input_idx = 0; input_idx < number_of_inputs; input_idx++) {
+      const auto& input_arg = function.signature().input_arg(input_idx);
+      if (return_input != input_arg.name()) continue;
+
+      ret.second = GetOutputNode(parent_function, input_idx);
+    }
+  }
+}
+
+// This function produces new function that is a result of fusion of
+// `parent_function` with `function`.
+FunctionDef* FuseFunctions(const FunctionDef& parent_function,
+                           const FunctionDef& function,
+                           FunctionDefLibrary* library) {
+  FunctionDef* fused_function = library->add_function();
+  graph_utils::SetUniqueGraphFunctionName("fused_function", library,
+                                          fused_function);
+
+  // Copy input signature from parent function.
+  *fused_function->mutable_signature()->mutable_input_arg() =
+      parent_function.signature().input_arg();
+
+  fused_function->mutable_node_def()->CopyFrom(parent_function.node_def());
+  // This code assumes functions does not have any attributes. If this is
+  // not the case, we need to merge attributes and fix name conflicts.
+  CHECK(parent_function.attr_size() == 0 && function.attr_size() == 0 &&
+        "Functions with attributes are currently not supported");
+
+  // Copy the returns and output signature from the second node.
+  auto nodes_to_fuse = function.node_def();
+  fused_function->mutable_signature()->mutable_output_arg()->CopyFrom(
+      function.signature().output_arg());
+  *fused_function->mutable_ret() = function.ret();
+
+  RenameFunctionNodes(fused_function, &nodes_to_fuse);
+  FuseFunctionNodes(parent_function, function, &nodes_to_fuse);
+  FuseReturns(parent_function, function, fused_function);
+
+  // Copy transformed nodes from the second function.
+  fused_function->mutable_node_def()->MergeFrom(nodes_to_fuse);
+
+  return fused_function;
+}
+
+}  // namespace
+
+Status MapFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
+                           GraphDef* output) {
+  GraphDef sorted_old_graph = item.graph;
+  TF_RETURN_IF_ERROR(TopologicalSort(&sorted_old_graph));
+  *output = sorted_old_graph;
+
+  MutableGraphView graph(output);
+  std::set<string> nodes_to_delete;
+  FunctionLibraryDefinition function_library(OpRegistry::Global(),
+                                             item.graph.library());
+
+  auto get_map_node = [](const NodeDef& node) -> const NodeDef* {
+    // TODO(prazek): we could also handle ParallelMapDataset and
+    // MapAndBatchDataset.
+    if (node.op() == "MapDataset") return &node;
+    return nullptr;
+  };
+
+  auto get_fused_function = [&function_library, &output](
+                                const NodeDef* parent_map_node,
+                                const NodeDef* map_node) {
+    const auto& parent_fun = parent_map_node->attr().at("f");
+    const FunctionDef* parent_func =
+        function_library.Find(parent_fun.func().name());
+    const auto& fun = map_node->attr().at("f");
+    const FunctionDef* func = function_library.Find(fun.func().name());
+
+    return FuseFunctions(*parent_func, *func, output->mutable_library());
+  };
+
+  for (const NodeDef& node : sorted_old_graph.node()) {
+    const NodeDef* map_node = get_map_node(node);
+    if (!map_node) continue;
+
+    GraphView::InputPort input_port = graph.GetInputPort(map_node->name(), 0);
+    const NodeDef* parent_map_node =
+        get_map_node(*graph.GetRegularFanin(input_port).node);
+    if (!parent_map_node) continue;
+
+    const auto* fused_function = get_fused_function(parent_map_node, map_node);
+    const auto* fused_maps_node = graph.AddNode(
+        MakeFusedNode(*parent_map_node, *map_node, *fused_function, &graph));
+
+    graph.ReplaceInput(*map_node, *fused_maps_node);
+
+    // TODO(prazek): we should run some optimizations on the fused map
+    // functions, or make sure that optimization passes run after map
+    // fusion.
+    TF_RETURN_IF_ERROR(function_library.AddFunctionDef(*fused_function));
+
+    // TODO(prazek): we could also remove map functions from library if they
+    // are not used anymore.
+    nodes_to_delete.insert(parent_map_node->name());
+    nodes_to_delete.insert(map_node->name());
+  }
+
+  graph.DeleteNodes(nodes_to_delete);
+  return Status::OK();
+}
+
+void MapFusion::Feedback(Cluster* cluster, const GrapplerItem& item,
+                         const GraphDef& optimize_output, double result) {
+  // no-op
+}
+
+REGISTER_GRAPH_OPTIMIZER_AS(MapFusion, "map_fusion");
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion.h b/tensorflow/core/grappler/optimizers/data/map_fusion.h
new file mode 100644
index 0000000000000000000000000000000000000000..a6a06592b80823458ee6ae3b655aecacbdfbb93b
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/map_fusion.h
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_FUSION_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_FUSION_H_
+
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+
+namespace tensorflow {
+namespace grappler {
+
+// This optimization fuses map transformations by merging their map functions.
+class MapFusion : public CustomGraphOptimizer {
+ public:
+  MapFusion() = default;
+  ~MapFusion() override = default;
+
+  string name() const override { return "map_fusion"; };
+
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
+    return Status::OK();
+  }
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* output) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_MAP_FUSION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_fusion_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..df6c19dc7c756e9a8d156f52ac5831a851a0ab0a
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/map_fusion_test.cc
@@ -0,0 +1,90 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/map_fusion.h"
+
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/framework/function_testlib.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+NodeDef MakeMapNode(StringPiece name, StringPiece input_node_name) {
+  return test::function::NDef(
+      name, "MapDataset", {input_node_name.ToString()},
+      {{"f", FunctionDefHelper::FunctionRef("XTimesTwo")},
+       {"Targuments", {}},
+       {"output_shapes", {}},
+       {"output_types", {}}});
+}
+
+TEST(MapFusionTest, FuseTwoMapNodesIntoOne) {
+  using test::function::NDef;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+       MakeMapNode("map1", "range"), MakeMapNode("map2", "map1")},
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+      });
+
+  MapFusion optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapDataset", output));
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map1", output));
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map2", output));
+}
+
+TEST(MapFusionTest, FuseThreeNodesIntoOne) {
+  using test::function::NDef;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("start", "Const", {}, {{"value", 0}, {"dtype", DT_INT32}}),
+       NDef("stop", "Const", {}, {{"value", 10}, {"dtype", DT_INT32}}),
+       NDef("step", "Const", {}, {{"value", 1}, {"dtype", DT_INT32}}),
+       NDef("filename", "Const", {}, {{"value", ""}, {"dtype", DT_STRING}}),
+       NDef("range", "RangeDataset", {"start", "stop", "step"}, {}),
+       MakeMapNode("map1", "range"), MakeMapNode("map2", "map1"),
+       MakeMapNode("map3", "map2"),
+       NDef("cache", "CacheDataset", {"map3", "filename"}, {})},
+      // FunctionLib
+      {
+          test::function::XTimesTwo(),
+      });
+
+  MapFusion optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapDataset", output));
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map1", output));
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map2", output));
+  EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map3", output));
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/noop_elimination.cc b/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
index 5670966367bd478da7dc675507bfb0c461a16e8c..55d57b3b97dfe5659b584fc1fcbdc22d199acd84 100644
--- a/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
+++ b/tensorflow/core/grappler/optimizers/data/noop_elimination.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
-#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
@@ -64,18 +64,19 @@ bool IsNoOp(const NodeDef& node, const GraphView& graph) {
 Status NoOpElimination::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* output) {
   *output = item.graph;
-  GraphView graph(output);
+  MutableGraphView graph(output);
   std::set<string> nodes_to_delete;
   for (const NodeDef& node : item.graph.node()) {
     if (!IsNoOp(node, graph)) continue;
 
     GraphView::InputPort input_port = graph.GetInputPort(node.name(), 0);
     NodeDef* const parent = graph.GetRegularFanin(input_port).node;
-    graph_utils::ReplaceInput(node, *parent, &graph);
+    graph.ReplaceInput(node, *parent);
 
     nodes_to_delete.insert(node.name());
   }
-  TF_RETURN_IF_ERROR(graph_utils::DeleteNodes(nodes_to_delete, output));
+
+  graph.DeleteNodes(nodes_to_delete);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/data/noop_elimination_test.cc b/tensorflow/core/grappler/optimizers/data/noop_elimination_test.cc
index 8628b16ea5014669dbf49abef1a281b4adbf243a..a6cc63edba35f0337c9f7f3f6fe43838314d63a5 100644
--- a/tensorflow/core/grappler/optimizers/data/noop_elimination_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/noop_elimination_test.cc
@@ -35,36 +35,32 @@ std::vector<std::pair<string, AttrValue>> GetCommonAttributes() {
   return commonAttributes;
 }
 
-void MakeUnaryNode(GraphDef *graph, const std::string &node_type, int count,
-                   string input_node, NodeDef **return_node) {
-  NodeDef *node_count;
-  TF_ASSERT_OK(
-      graph_utils::AddScalarConstNode<int64>(count, graph, &node_count));
-  TF_ASSERT_OK(graph_utils::AddNode("", node_type,
-                                    {std::move(input_node), node_count->name()},
-                                    GetCommonAttributes(), graph, return_node));
+NodeDef *MakeUnaryNode(const std::string &node_type, int count,
+                       string input_node, MutableGraphView *graph) {
+  NodeDef *node_count = graph_utils::AddScalarConstNode<int64>(count, graph);
+  return graph_utils::AddNode("", node_type,
+                              {std::move(input_node), node_count->name()},
+                              GetCommonAttributes(), graph);
 }
 
-void MakeCacheNode(GraphDef *graph, string input_node, NodeDef **return_node) {
-  NodeDef *node_filename;
-  TF_ASSERT_OK(
-      graph_utils::AddScalarConstNode<StringPiece>("", graph, &node_filename));
-  TF_ASSERT_OK(graph_utils::AddNode(
-      "", "CacheDataset", {std::move(input_node), node_filename->name()},
-      GetCommonAttributes(), graph, return_node));
+NodeDef *MakeCacheNode(string input_node, MutableGraphView *graph) {
+  NodeDef *node_filename =
+      graph_utils::AddScalarConstNode<StringPiece>("", graph);
+  return graph_utils::AddNode("", "CacheDataset",
+                              {std::move(input_node), node_filename->name()},
+                              GetCommonAttributes(), graph);
 }
 
-void MakeRangeNode(GraphDef *graph, NodeDef **range_node) {
-  NodeDef *start_node, *stop_node, *step_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+NodeDef *MakeRangeNode(MutableGraphView *graph) {
+  auto *start_node = graph_utils::AddScalarConstNode<int64>(0, graph);
+  auto *stop_node = graph_utils::AddScalarConstNode<int64>(10, graph);
+  auto *step_node = graph_utils::AddScalarConstNode<int64>(1, graph);
 
   std::vector<string> range_inputs = {start_node->name(), stop_node->name(),
                                       step_node->name()};
 
-  TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
-                                    GetCommonAttributes(), graph, range_node));
+  return graph_utils::AddNode("", "RangeDataset", range_inputs,
+                              GetCommonAttributes(), graph);
 }
 
 struct NoOpLastEliminationTest
@@ -74,23 +70,22 @@ struct NoOpLastEliminationTest
 // transformations at the end of the pipeline.
 TEST_P(NoOpLastEliminationTest, EliminateLastNoOpNode) {
   GrapplerItem item;
-  GraphDef *graph = &item.graph;
+  MutableGraphView graph(&item.graph);
 
   const std::string &node_type = std::get<0>(GetParam());
   const int node_count = std::get<1>(GetParam());
   const bool should_keep_node = std::get<2>(GetParam());
 
-  NodeDef *range_node;
-  MakeRangeNode(graph, &range_node);
+  NodeDef *range_node = MakeRangeNode(&graph);
 
-  NodeDef *node;
-  MakeUnaryNode(graph, node_type, node_count, range_node->name(), &node);
+  NodeDef *node =
+      MakeUnaryNode(node_type, node_count, range_node->name(), &graph);
 
   NoOpElimination optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  EXPECT_EQ(graph_utils::ContainsNodeWithName(node->name(), output),
+  EXPECT_EQ(graph_utils::ContainsGraphNodeWithName(node->name(), output),
             should_keep_node);
 }
 
@@ -113,30 +108,29 @@ struct NoOpMiddleEliminationTest
 // transformations int the middle of the pipeline.
 TEST_P(NoOpMiddleEliminationTest, EliminateMiddleNoOpNode) {
   GrapplerItem item;
-  GraphDef *graph = &item.graph;
+  MutableGraphView graph(&item.graph);
 
   const std::string &node_type = std::get<0>(GetParam());
   const int node_count = std::get<1>(GetParam());
   const bool should_keep_node = std::get<2>(GetParam());
 
-  NodeDef *range_node;
-  MakeRangeNode(graph, &range_node);
+  NodeDef *range_node = MakeRangeNode(&graph);
 
-  NodeDef *node;
-  MakeUnaryNode(graph, node_type, node_count, range_node->name(), &node);
+  NodeDef *node =
+      MakeUnaryNode(node_type, node_count, range_node->name(), &graph);
 
-  NodeDef *cache_node;
-  MakeCacheNode(graph, node->name(), &cache_node);
+  NodeDef *cache_node = MakeCacheNode(node->name(), &graph);
   NoOpElimination optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  EXPECT_EQ(graph_utils::ContainsNodeWithName(node->name(), output),
+  EXPECT_EQ(graph_utils::ContainsGraphNodeWithName(node->name(), output),
             should_keep_node);
-  EXPECT_TRUE(graph_utils::ContainsNodeWithName(cache_node->name(), output));
+  EXPECT_TRUE(
+      graph_utils::ContainsGraphNodeWithName(cache_node->name(), output));
 
-  NodeDef cache_node_out =
-      output.node(graph_utils::FindNodeWithName(cache_node->name(), output));
+  NodeDef cache_node_out = output.node(
+      graph_utils::FindGraphNodeWithName(cache_node->name(), output));
 
   EXPECT_EQ(cache_node_out.input_size(), 2);
   auto last_node_input = (should_keep_node ? node : range_node)->name();
@@ -162,41 +156,40 @@ struct NoOpMultipleEliminationTest : ::testing::TestWithParam<NodesTypes> {};
 // multiple noop nodes.
 TEST_P(NoOpMultipleEliminationTest, EliminateMultipleNoOpNode) {
   GrapplerItem item;
-  GraphDef *graph = &item.graph;
+  MutableGraphView graph(&item.graph);
 
   static_assert(std::tuple_size<NodesTypes>::value == 2,
                 "Make sure to include everything in the test");
   const std::vector<std::pair<string, int>> noop_nodes = {
       std::get<0>(GetParam()), std::get<1>(GetParam())};
 
-  NodeDef *range_node;
-  MakeRangeNode(graph, &range_node);
+  NodeDef *range_node = MakeRangeNode(&graph);
 
   NodeDef *previous = range_node;
   std::vector<string> nodes_to_remove;
   nodes_to_remove.reserve(noop_nodes.size());
 
   for (const auto &noop_node : noop_nodes) {
-    NodeDef *node;
-    MakeUnaryNode(graph, noop_node.first, noop_node.second, previous->name(),
-                  &node);
+    NodeDef *node = MakeUnaryNode(noop_node.first, noop_node.second,
+                                  previous->name(), &graph);
     nodes_to_remove.push_back(node->name());
     previous = node;
   }
 
-  NodeDef *cache_node;
-  MakeCacheNode(graph, previous->name(), &cache_node);
+  NodeDef *cache_node = MakeCacheNode(previous->name(), &graph);
   NoOpElimination optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
   for (const auto &noop_node_name : nodes_to_remove)
-    EXPECT_FALSE(graph_utils::ContainsNodeWithName(noop_node_name, output));
+    EXPECT_FALSE(
+        graph_utils::ContainsGraphNodeWithName(noop_node_name, output));
 
-  EXPECT_TRUE(graph_utils::ContainsNodeWithName(cache_node->name(), output));
+  EXPECT_TRUE(
+      graph_utils::ContainsGraphNodeWithName(cache_node->name(), output));
 
-  NodeDef cache_node_out =
-      output.node(graph_utils::FindNodeWithName(cache_node->name(), output));
+  NodeDef cache_node_out = output.node(
+      graph_utils::FindGraphNodeWithName(cache_node->name(), output));
 
   EXPECT_EQ(cache_node_out.input_size(), 2);
   EXPECT_EQ(cache_node_out.input(0), range_node->name());
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
index 8332fb0b1e4ffaa8a5f40b8262c69758fb005ec2..7c7161c5b27de9b6981fc33fe8631a5d040a7265 100644
--- a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
@@ -18,8 +18,8 @@ limitations under the License.
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/clusters/cluster.h"
-#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
@@ -38,55 +38,62 @@ Status ShuffleAndRepeatFusion::Optimize(Cluster* cluster,
                                         const GrapplerItem& item,
                                         GraphDef* output) {
   *output = item.graph;
-  GraphView graph(output);
+  MutableGraphView graph(output);
   std::set<string> nodes_to_delete;
-  for (const NodeDef& node : item.graph.node()) {
-    if (node.op() != "RepeatDataset") {
-      continue;
-    }
 
-    // Use a more descriptive variable name now that we know the node type.
-    const NodeDef repeat_node(node);
-    GraphView::InputPort input_port = graph.GetInputPort(repeat_node.name(), 0);
-    NodeDef* node2 = graph.GetRegularFanin(input_port).node;
-    if (node2->op() != "ShuffleDataset") {
-      continue;
-    }
-
-    NodeDef* new_node = output->add_node();
-    new_node->set_op(kFusedOpName);
-    graph_utils::SetUniqueName(kFusedOpName, output, new_node);
-
-    // Use a more descriptive variable name now that we know the node type.
-    NodeDef* shuffle_node = node2;
+  auto make_shuffle_and_repeat_node = [&output](const NodeDef& shuffle_node,
+                                                const NodeDef& repeat_node) {
+    NodeDef new_node;
+    new_node.set_op(kFusedOpName);
+    graph_utils::SetUniqueGraphNodeName(kFusedOpName, output, &new_node);
 
     // Set the `input` input argument.
-    new_node->add_input(shuffle_node->input(0));
+    new_node.add_input(shuffle_node.input(0));
 
     // Set the `buffer_size` input argument.
-    new_node->add_input(shuffle_node->input(1));
+    new_node.add_input(shuffle_node.input(1));
 
     // Set the `seed` input argument.
-    new_node->add_input(shuffle_node->input(2));
+    new_node.add_input(shuffle_node.input(2));
 
     // Set the `seed2` input argument.
-    new_node->add_input(shuffle_node->input(3));
+    new_node.add_input(shuffle_node.input(3));
 
     // Set the `count` input argument.
-    new_node->add_input(repeat_node.input(1));
+    new_node.add_input(repeat_node.input(1));
 
     // Set `output_types` and `output_shapes` attributes.
     for (auto key : {"output_shapes", "output_types"}) {
-      (*new_node->mutable_attr())[key] = repeat_node.attr().at(key);
+      (*new_node.mutable_attr())[key] = repeat_node.attr().at(key);
     }
+    return new_node;
+  };
+
+  for (const NodeDef& node : item.graph.node()) {
+    if (node.op() != "RepeatDataset") {
+      continue;
+    }
+
+    // Use a more descriptive variable name now that we know the node type.
+    const NodeDef& repeat_node = node;
+    GraphView::InputPort input_port = graph.GetInputPort(repeat_node.name(), 0);
+    NodeDef* node2 = graph.GetRegularFanin(input_port).node;
+    if (node2->op() != "ShuffleDataset") {
+      continue;
+    }
+    // Use a more descriptive variable name now that we know the node type.
+    const NodeDef& shuffle_node = *node2;
+
+    NodeDef* shuffle_and_repeat_node =
+        graph.AddNode(make_shuffle_and_repeat_node(shuffle_node, repeat_node));
+    graph.ReplaceInput(repeat_node, *shuffle_and_repeat_node);
 
     // Mark the `Shuffle` and `Repeat` nodes for removal.
-    nodes_to_delete.insert(shuffle_node->name());
+    nodes_to_delete.insert(shuffle_node.name());
     nodes_to_delete.insert(repeat_node.name());
-
-    graph_utils::ReplaceInput(repeat_node, *new_node, &graph);
   }
-  TF_RETURN_IF_ERROR(graph_utils::DeleteNodes(nodes_to_delete, output));
+
+  graph.DeleteNodes(nodes_to_delete);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc
index e89675efb772e970ffc8d3489e5ab1fc25722782..a2e470e511f62c4e6677878c7a6da0122549f985 100644
--- a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc
@@ -27,7 +27,7 @@ namespace {
 
 TEST(ShuffleAndRepeatFusionTest, FuseShuffleAndRepeatNodesIntoOne) {
   GrapplerItem item;
-  GraphDef *graph = &item.graph;
+  MutableGraphView graph(&item.graph);
 
   std::vector<std::pair<string, AttrValue>> common_attrs(2);
   AttrValue shapes_attr;
@@ -37,52 +37,44 @@ TEST(ShuffleAndRepeatFusionTest, FuseShuffleAndRepeatNodesIntoOne) {
   SetAttrValue("output_types", &types_attr);
   common_attrs[1] = std::make_pair("output_types", types_attr);
 
-  NodeDef *start_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
-  NodeDef *stop_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
-  NodeDef *step_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+  NodeDef *start_node = graph_utils::AddScalarConstNode<int64>(0, &graph);
+  NodeDef *stop_node = graph_utils::AddScalarConstNode<int64>(10, &graph);
+  NodeDef *step_node = graph_utils::AddScalarConstNode<int64>(1, &graph);
 
   std::vector<string> range_inputs(3);
   range_inputs[0] = start_node->name();
   range_inputs[1] = stop_node->name();
   range_inputs[2] = step_node->name();
-  NodeDef *range_node;
-  TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
-                                    common_attrs, graph, &range_node));
-
-  NodeDef *buffer_size_node;
-  TF_ASSERT_OK(
-      graph_utils::AddScalarConstNode<int64>(128, graph, &buffer_size_node));
-  NodeDef *seed_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(-1, graph, &seed_node));
-  NodeDef *seed2_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(-1, graph, &seed2_node));
+  NodeDef *range_node = graph_utils::AddNode("", "RangeDataset", range_inputs,
+                                             common_attrs, &graph);
+
+  NodeDef *buffer_size_node =
+      graph_utils::AddScalarConstNode<int64>(128, &graph);
+  NodeDef *seed_node = graph_utils::AddScalarConstNode<int64>(-1, &graph);
+  NodeDef *seed2_node = graph_utils::AddScalarConstNode<int64>(-1, &graph);
   std::vector<string> shuffle_inputs(4);
   shuffle_inputs[0] = range_node->name();
   shuffle_inputs[1] = buffer_size_node->name();
   shuffle_inputs[2] = seed_node->name();
   shuffle_inputs[3] = seed2_node->name();
-  NodeDef *shuffle_node;
-  TF_ASSERT_OK(graph_utils::AddNode("", "ShuffleDataset", shuffle_inputs,
-                                    common_attrs, graph, &shuffle_node));
+  NodeDef *shuffle_node = graph_utils::AddNode(
+      "", "ShuffleDataset", shuffle_inputs, common_attrs, &graph);
 
-  NodeDef *count_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(-1, graph, &count_node));
+  NodeDef *count_node = graph_utils::AddScalarConstNode<int64>(-1, &graph);
   std::vector<string> repeat_inputs(2);
   repeat_inputs[0] = shuffle_node->name();
   repeat_inputs[1] = count_node->name();
-  NodeDef *repeat_node;
-  TF_ASSERT_OK(graph_utils::AddNode("", "RepeatDataset", repeat_inputs,
-                                    common_attrs, graph, &repeat_node));
+  NodeDef *repeat_node = graph_utils::AddNode(
+      "", "RepeatDataset", repeat_inputs, common_attrs, &graph);
 
   ShuffleAndRepeatFusion optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  EXPECT_FALSE(graph_utils::ContainsNodeWithName(shuffle_node->name(), output));
-  EXPECT_FALSE(graph_utils::ContainsNodeWithName(repeat_node->name(), output));
+  EXPECT_FALSE(
+      graph_utils::ContainsGraphNodeWithName(shuffle_node->name(), output));
+  EXPECT_FALSE(
+      graph_utils::ContainsGraphNodeWithName(repeat_node->name(), output));
   EXPECT_TRUE(
       graph_utils::ContainsNodeWithOp("ShuffleAndRepeatDataset", output));
   NodeDef shuffle_and_repeat_node = output.node(
@@ -103,7 +95,7 @@ TEST(ShuffleAndRepeatFusionTest, FuseShuffleAndRepeatNodesIntoOne) {
 
 TEST(ShuffleAndRepeatFusionTest, NoChange) {
   GrapplerItem item;
-  GraphDef *graph = &item.graph;
+  MutableGraphView graph(&item.graph);
 
   std::vector<std::pair<string, AttrValue>> common_attrs(2);
   AttrValue shapes_attr;
@@ -113,35 +105,29 @@ TEST(ShuffleAndRepeatFusionTest, NoChange) {
   SetAttrValue("output_types", &types_attr);
   common_attrs[1] = std::make_pair("output_types", types_attr);
 
-  NodeDef *start_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
-  NodeDef *stop_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
-  NodeDef *step_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+  NodeDef *start_node = graph_utils::AddScalarConstNode<int64>(0, &graph);
+  NodeDef *stop_node = graph_utils::AddScalarConstNode<int64>(10, &graph);
+  NodeDef *step_node = graph_utils::AddScalarConstNode<int64>(1, &graph);
 
   std::vector<string> range_inputs(3);
   range_inputs[0] = start_node->name();
   range_inputs[1] = stop_node->name();
   range_inputs[2] = step_node->name();
-  NodeDef *range_node;
-  TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
-                                    common_attrs, graph, &range_node));
+  NodeDef *range_node = graph_utils::AddNode("", "RangeDataset", range_inputs,
+                                             common_attrs, &graph);
 
-  NodeDef *count_node;
-  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(-1, graph, &count_node));
+  NodeDef *count_node = graph_utils::AddScalarConstNode<int64>(-1, &graph);
   std::vector<string> repeat_inputs(2);
   repeat_inputs[0] = range_node->name();
   repeat_inputs[1] = count_node->name();
-  NodeDef *repeat_node;
-  TF_ASSERT_OK(graph_utils::AddNode("", "RepeatDataset", repeat_inputs,
-                                    common_attrs, graph, &repeat_node));
+  graph_utils::AddNode("", "RepeatDataset", repeat_inputs, common_attrs,
+                       &graph);
 
   ShuffleAndRepeatFusion optimizer;
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  EXPECT_TRUE(graph_utils::Compare(*graph, output));
+  EXPECT_TRUE(graph_utils::Compare(*graph.GetGraph(), output));
 }
 
 }  // namespace
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index fdd82b9603716da6f6c522d11177b3a4f81627b8..bb14ce310dc151d109b1106e82c424f59b9e6cec 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -53,16 +54,6 @@ bool RemoveInput(NodeDef* node, const string& input, NodeMap* node_map) {
   return removed_input;
 }
 
-void DeleteNodes(const std::set<int>& nodes_to_delete, GraphDef* graph) {
-  int last = graph->node_size() - 1;
-  for (auto it = nodes_to_delete.rbegin(); it != nodes_to_delete.rend(); ++it) {
-    const int index = *it;
-    graph->mutable_node()->SwapElements(index, last);
-    last--;
-  }
-  graph->mutable_node()->DeleteSubrange(last + 1, nodes_to_delete.size());
-}
-
 }  // namespace
 
 bool DependencyOptimizer::SafeToRemoveIdentity(const NodeDef& node) const {
@@ -441,7 +432,7 @@ Status DependencyOptimizer::OptimizeDependencies() {
   if (fetch_nodes_known_) {
     VLOG(1) << "Deleted " << nodes_to_delete.size() << " out of "
             << optimized_graph_->node_size() << " nodes.";
-    DeleteNodes(nodes_to_delete, optimized_graph_);
+    EraseNodesFromGraph(nodes_to_delete, optimized_graph_);
     node_map_.reset(new NodeMap(optimized_graph_));
     BuildNodeToIdx();
   }
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index 9627ed732346bcc0ce6a98c2a9e66129fcc6f7b8..405778222ac79483e4040791d6983a49ca199e6e 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -616,15 +616,13 @@ Status RemoveDeadBranches(const std::unordered_set<string>& nodes_to_preserve,
     }
   }
 
-  int last = optimized_graph->node_size() - 1;
-  for (int i = optimized_graph->node_size() - 1; i >= 0; --i) {
-    NodeDef* node = optimized_graph->mutable_node(i);
-    if (dead_nodes.find(node) != dead_nodes.end()) {
-      optimized_graph->mutable_node()->SwapElements(i, last);
-      last--;
-    }
+  std::vector<int> nodes_idx_to_delete;
+  nodes_idx_to_delete.reserve(dead_nodes.size());
+  for (int i = 0; i < optimized_graph->node_size(); ++i) {
+    if (dead_nodes.count(&optimized_graph->node(i)))
+      nodes_idx_to_delete.push_back(i);
   }
-  optimized_graph->mutable_node()->DeleteSubrange(last + 1, dead_nodes.size());
+  EraseNodesFromGraph(std::move(nodes_idx_to_delete), optimized_graph);
 
   for (const auto& itr : dead_merge_inputs) {
     NodeDef* dead_node = itr.first;
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index c8e63f95e1855f6593d19d2bc111e468fe2b5eee..153785d3b4770011b516bf84530e662e9a0dc9cb 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -13,7 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/grappler/utils.h"
+
 #include <memory>
+#include <queue>
 #include <vector>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
@@ -21,7 +24,6 @@ limitations under the License.
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/scanner.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -354,13 +356,51 @@ void DedupControlInputs(NodeDef* node) {
 }
 
 namespace {
+
+template <typename UniqueContainer>
+void EraseNodesFromGraphImpl(const UniqueContainer& nodes_to_delete,
+                             GraphDef* graph) {
+  static_assert(std::is_same<typename UniqueContainer::value_type, int>::value,
+                "Need to pass container of ints");
+
+  int last = graph->node_size() - 1;
+  for (auto it = nodes_to_delete.rbegin(); it != nodes_to_delete.rend(); ++it) {
+    const int index = *it;
+    graph->mutable_node()->SwapElements(index, last);
+    last--;
+  }
+  graph->mutable_node()->DeleteSubrange(last + 1, nodes_to_delete.size());
+}
+
 template <typename T>
 inline void STLSortAndRemoveDuplicates(T* v) {
   std::sort(v->begin(), v->end());
   v->erase(std::unique(v->begin(), v->end()), v->end());
 }
+
 }  // namespace
 
+void EraseNodesFromGraph(const std::set<int>& nodes_to_delete,
+                         GraphDef* graph) {
+  EraseNodesFromGraphImpl(nodes_to_delete, graph);
+}
+
+void EraseNodesFromGraph(std::vector<int>&& nodes_to_delete, GraphDef* graph) {
+  STLSortAndRemoveDuplicates(&nodes_to_delete);
+  EraseNodesFromGraphImpl(nodes_to_delete, graph);
+}
+
+void EraseNodesFromGraph(const std::set<string>& nodes_to_delete,
+                         GraphDef* graph) {
+  std::vector<int> nodes_idx_to_delete;
+  nodes_idx_to_delete.reserve(nodes_to_delete.size());
+  for (int i = 0; i < graph->node_size(); ++i) {
+    if (nodes_to_delete.count(graph->node(i).name()))
+      nodes_idx_to_delete.push_back(i);
+  }
+  EraseNodesFromGraphImpl(nodes_idx_to_delete, graph);
+}
+
 Status SimpleGraphView::Initialize(
     const GraphDef& graph,
     const std::vector<std::pair<const NodeDef*, const NodeDef*>>*
diff --git a/tensorflow/core/grappler/utils.h b/tensorflow/core/grappler/utils.h
index 1c6fef59eaec8e9b9ae867d01e52382fd758e15e..b297caa8d4ecf866385c8ba12ef9703b9a4fd834 100644
--- a/tensorflow/core/grappler/utils.h
+++ b/tensorflow/core/grappler/utils.h
@@ -209,6 +209,13 @@ void PermuteNodesInPlace(GraphDef* graph, std::vector<int>* permutation,
 
 Status SetTensorValue(DataType dtype, int value, Tensor* tensor);
 
+void EraseNodesFromGraph(const std::set<int>& nodes_to_delete, GraphDef* graph);
+
+void EraseNodesFromGraph(std::vector<int>&& nodes_to_delete, GraphDef* graph);
+
+void EraseNodesFromGraph(const std::set<string>& nodes_to_delete,
+                         GraphDef* graph);
+
 class SimpleGraphView {
  public:
   // Build a graph view for the specified graphdef.
@@ -237,6 +244,9 @@ class SimpleGraphView {
     DCHECK(it != name_to_index_.end());
     return it == name_to_index_.end() ? -1 : it->second;
   }
+  inline const NodeDef& node(int node_idx) const {
+    return graph_->node(node_idx);
+  }
   inline const string& node_name(int node_idx) const {
     return index_to_name_[node_idx];
   }
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index 49a1996d25e78d17908b1eae04c9acbeb7e2c788..c6e035834cbbf8e9851e521ccaa797ca8f3e2f58 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -16,7 +16,9 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/notification.h"
@@ -333,7 +335,9 @@ TEST_F(UtilsTest, NumNonControlOutputs) {
   EXPECT_EQ(1, NumNonControlDataOutputs(*add_node, node_map));
 }
 
-TEST_F(UtilsTest, DeleteNodes) {}
+TEST_F(UtilsTest, DeleteNodes) {
+  // TODO(rmlarsen): write forgtten test.
+}
 
 }  // namespace
 }  // namespace grappler
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 10cbcdecc85928c51dabe92c05f0d60ed63d8db7..65a7f8ccf3faf0f0fa4662f7d397b16480ef16ef 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -124,6 +124,7 @@ tf_kernel_library(
         ":bounds_check",
         ":dense_update_functor",
         ":ops_util",
+        ":training_op_helpers",
         ":variable_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -557,12 +558,6 @@ cc_header_only_library(
     deps = [":image_resizer_state"],
 )
 
-cc_library(
-    name = "crop_resize_bilinear_core",
-    hdrs = ["crop_resize_bilinear_core.h"],
-    visibility = ["//visibility:private"],
-)
-
 # OpKernel libraries ----------------------------------------------------------
 
 ARRAY_DEPS = [
@@ -2158,7 +2153,7 @@ tf_kernel_library(
 tf_kernel_library(
     name = "crop_and_resize_op",
     prefix = "crop_and_resize_op",
-    deps = IMAGE_DEPS + [":crop_resize_bilinear_core"],
+    deps = IMAGE_DEPS,
 )
 
 tf_kernel_library(
@@ -2224,7 +2219,7 @@ tf_kernel_library(
 tf_kernel_library(
     name = "resize_bilinear_op",
     prefix = "resize_bilinear_op",
-    deps = IMAGE_DEPS + [":crop_resize_bilinear_core"],
+    deps = IMAGE_DEPS,
 )
 
 tf_kernel_library(
@@ -4864,7 +4859,6 @@ filegroup(
         "concat_op.cc",
         "constant_op.cc",
         "constant_op.h",
-        "crop_resize_bilinear_core.h",
         "cwise_ops.h",
         "cwise_ops_common.cc",
         "cwise_ops_common.h",
@@ -5232,6 +5226,16 @@ filegroup(
     visibility = ["//visibility:public"],
 )
 
+ANDROID_TEXTUAL_HDRS = [
+    "gather_nd_op_cpu_impl.h",
+    "gemm_functors.h",
+    "mirror_pad_op_cpu_impl.h",
+    "scatter_nd_op_cpu_impl.h",
+    "slice_op_cpu_impl.h",
+    "strided_slice_op_impl.h",
+    "tile_ops_cpu_impl.h",
+]
+
 # A file group which contains nearly all available operators which
 # may work on Android. This is intended to be used with selective
 # registration.
@@ -5293,10 +5297,20 @@ filegroup(
             "batch_kernels.*",
             "regex_full_match_op.cc",
             "regex_replace_op.cc",
-        ],
+            # Ops that are inherently incompatible with Android (e.g. tied to x86 platform).
+            "mkl_*",
+            "xsmm_*",
+            "cwise_ops_sycl_common.h",
+        ] + ANDROID_TEXTUAL_HDRS,
     ),
     visibility = ["//visibility:public"],
 )
+
+filegroup(
+    name = "android_all_ops_textual_hdrs",
+    srcs = ANDROID_TEXTUAL_HDRS,
+    visibility = ["//visibility:public"],
+)
 # LINT.ThenChange(//tensorflow/contrib/makefile/tf_op_files.txt)
 
 cc_library(
diff --git a/tensorflow/core/kernels/argmax_op.cc b/tensorflow/core/kernels/argmax_op.cc
index 49cd997fed544c221a2cd32598b050a02d271f86..c731b64993b3a6cebfb46eca9221ca28b729e845 100644
--- a/tensorflow/core/kernels/argmax_op.cc
+++ b/tensorflow/core/kernels/argmax_op.cc
@@ -59,7 +59,7 @@ class ArgOp : public OpKernel {
 
     int axis = dim < 0 ? dim + input_dims : dim;
 
-    OP_REQUIRES(context, axis >= 0 && axis < input_dims,
+    OP_REQUIRES(context, FastBoundsCheck(axis, input_dims),
                 errors::InvalidArgument("Expected dimension in the range [",
                                         -input_dims, ", ", input_dims,
                                         "), but got ", dim));
@@ -76,6 +76,10 @@ class ArgOp : public OpKernel {
     Tensor* output = nullptr;
     OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
 
+    if (output_shape.num_elements() == 0) {
+      return;
+    }
+
 #define HANDLE_DIM(NDIM)                                        \
   case NDIM:                                                    \
     ArgFunctor::Reduce##NDIM(context->eigen_device<Device>(),   \
diff --git a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
index b77c14d01284319e8d05d67ce1b6d1b94f4c394b..656b6ced6de00933cfe8db7dadd1a56ade212758 100644
--- a/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
+++ b/tensorflow/core/kernels/batching_util/adaptive_shared_batch_scheduler.h
@@ -147,13 +147,21 @@ class AdaptiveSharedBatchScheduler
 
   // Tracks processing latency and adjusts in_flight_batches_limit to minimize.
   void CallbackWrapper(const internal::ASBSBatch<TaskType>* batch,
-                       BatchProcessor callback);
+                       BatchProcessor callback, bool is_express);
 
   // Schedules batch if in_flight_batches_limit_ is not met.
   void MaybeScheduleNextBatch() EXCLUSIVE_LOCKS_REQUIRED(mu_);
 
+  // Schedules the earliest closed batch in batches_
+  // if batch_thread_pool_ has an idle thead.
+  // Batches scheduled this way are called express batches.
+  // Express batches are not limited by in_flight_batches_limit_, and
+  // their latencies will not affect in_flight_batches_limit_.
+  void MaybeScheduleClosedBatch() EXCLUSIVE_LOCKS_REQUIRED(mu_);
+
   // Notifies scheduler of non-empty batch which is eligible for processing.
-  void AddBatch(const internal::ASBSBatch<TaskType>* batch);
+  void AddBatch(const internal::ASBSBatch<TaskType>* batch,
+                bool also_schedule_closed_batch);
 
   // Removes queue from scheduler.
   void RemoveQueue(const internal::ASBSQueue<TaskType>* queue);
@@ -180,8 +188,10 @@ class AdaptiveSharedBatchScheduler
   // results in an actual cap of 3 80% of the time, and 4 20% of the time.
   double in_flight_batches_limit_ GUARDED_BY(mu_);
 
-  // Number of batches currently being processed.
+  // Number of regular batches currently being processed.
   int64 in_flight_batches_ GUARDED_BY(mu_) = 0;
+  // Number of express batches currently being processed.
+  int64 in_flight_express_batches_ GUARDED_BY(mu_) = 0;
 
   // RNG engine and distribution.
   std::default_random_engine rand_engine_;
@@ -363,10 +373,14 @@ Status AdaptiveSharedBatchScheduler<TaskType>::AddQueue(
 
 template <typename TaskType>
 void AdaptiveSharedBatchScheduler<TaskType>::AddBatch(
-    const internal::ASBSBatch<TaskType>* batch) {
+    const internal::ASBSBatch<TaskType>* batch,
+    bool also_schedule_closed_batch) {
   mutex_lock l(mu_);
   batches_.push_back(batch);
   MaybeScheduleNextBatch();
+  if (also_schedule_closed_batch) {
+    MaybeScheduleClosedBatch();
+  }
 }
 
 template <typename TaskType>
@@ -407,19 +421,45 @@ void AdaptiveSharedBatchScheduler<TaskType>::MaybeScheduleNextBatch() {
   batch->queue()->ReleaseBatch(batch);
   batch_thread_pool_->Schedule(
       std::bind(&AdaptiveSharedBatchScheduler<TaskType>::CallbackWrapper, this,
-                batch, queues_and_callbacks_[batch->queue()]));
+                batch, queues_and_callbacks_[batch->queue()], false));
   in_flight_batches_++;
 }
 
+template <typename TaskType>
+void AdaptiveSharedBatchScheduler<TaskType>::MaybeScheduleClosedBatch() {
+  if (in_flight_batches_ + in_flight_express_batches_ >=
+      options_.num_batch_threads) {
+    return;
+  }
+  for (auto it = batches_.begin(); it != batches_.end(); it++) {
+    if ((*it)->IsClosed()) {
+      const internal::ASBSBatch<TaskType>* batch = *it;
+      batches_.erase(it);
+      batch->queue()->ReleaseBatch(batch);
+      batch_thread_pool_->Schedule(
+          std::bind(&AdaptiveSharedBatchScheduler<TaskType>::CallbackWrapper,
+                    this, batch, queues_and_callbacks_[batch->queue()], true));
+      in_flight_express_batches_++;
+      return;
+    }
+  }
+}
+
 template <typename TaskType>
 void AdaptiveSharedBatchScheduler<TaskType>::CallbackWrapper(
     const internal::ASBSBatch<TaskType>* batch,
-    AdaptiveSharedBatchScheduler<TaskType>::BatchProcessor callback) {
+    AdaptiveSharedBatchScheduler<TaskType>::BatchProcessor callback,
+    bool is_express) {
   int64 start_time = batch->creation_time_micros();
   callback(std::unique_ptr<Batch<TaskType>>(
       const_cast<internal::ASBSBatch<TaskType>*>(batch)));
   int64 end_time = GetEnv()->NowMicros();
   mutex_lock l(mu_);
+  if (is_express) {
+    in_flight_express_batches_--;
+    MaybeScheduleClosedBatch();
+    return;
+  }
   in_flight_batches_--;
   batch_count_++;
   batch_latency_sum_ += end_time - start_time;
@@ -496,6 +536,7 @@ Status ASBSQueue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
                                    " is larger than maximum batch size ",
                                    options_.max_batch_size);
   }
+  bool is_old_batch_closed = false;
   {
     mutex_lock l(mu_);
     // Current batch is full, create another if allowed.
@@ -505,6 +546,7 @@ Status ASBSQueue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
         return errors::Unavailable("The batch scheduling queue is full");
       }
       current_batch_->Close();
+      is_old_batch_closed = true;
       current_batch_ = nullptr;
     }
     if (!current_batch_) {
@@ -516,7 +558,8 @@ Status ASBSQueue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
     num_enqueued_tasks_++;
   }
   // AddBatch must be called outside of lock, since it may call ReleaseBatch.
-  if (new_batch != nullptr) scheduler_->AddBatch(new_batch);
+  if (new_batch != nullptr)
+    scheduler_->AddBatch(new_batch, is_old_batch_closed);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/kernels/cast_op.cc b/tensorflow/core/kernels/cast_op.cc
index e6e388b3d107a51a500401d24261020ce874cf25..0478c9328056dfa5a3a5a6438d687e3acfc65763 100644
--- a/tensorflow/core/kernels/cast_op.cc
+++ b/tensorflow/core/kernels/cast_op.cc
@@ -55,8 +55,41 @@ typedef Eigen::SyclDevice SYCLDevice;
   FN(arg0, std::complex<double>)
 
 CastOpBase::CastOpBase(OpKernelConstruction* ctx) : OpKernel(ctx) {
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("SrcT", &src_dtype_));
-  OP_REQUIRES_OK(ctx, ctx->GetAttr("DstT", &dst_dtype_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("SrcT", &external_src_dtype_));
+
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("DstT", &external_dst_dtype_));
+
+  OP_REQUIRES_OK(ctx, ctx->GetAttr("Truncate", &use_truncation_));
+
+  // Quantized data types use the same underlying format as their non quantized
+  // version so we use the non quantized implementation for casting.
+  if (external_dst_dtype_ == DT_QUINT8) {
+    dst_dtype_ = DT_UINT8;
+  } else if (external_dst_dtype_ == DT_QINT8) {
+    dst_dtype_ = DT_INT8;
+  } else if (external_dst_dtype_ == DT_QINT32) {
+    dst_dtype_ = DT_INT32;
+  } else if (external_dst_dtype_ == DT_QINT16) {
+    dst_dtype_ = DT_INT16;
+  } else if (external_dst_dtype_ == DT_QUINT16) {
+    dst_dtype_ = DT_UINT16;
+  } else {
+    dst_dtype_ = external_dst_dtype_;
+  }
+
+  if (external_src_dtype_ == DT_QUINT8) {
+    src_dtype_ = DT_UINT8;
+  } else if (external_src_dtype_ == DT_QINT8) {
+    src_dtype_ = DT_INT8;
+  } else if (external_src_dtype_ == DT_QINT32) {
+    src_dtype_ = DT_INT32;
+  } else if (external_src_dtype_ == DT_QINT16) {
+    src_dtype_ = DT_INT16;
+  } else if (external_src_dtype_ == DT_QUINT16) {
+    src_dtype_ = DT_UINT16;
+  } else {
+    src_dtype_ = external_src_dtype_;
+  }
 }
 
 void CastOpBase::Compute(OpKernelContext* ctx) {
@@ -64,15 +97,20 @@ void CastOpBase::Compute(OpKernelContext* ctx) {
   if (work_ == nullptr) {
     ctx->set_output(0, inp);
   } else {
+    Tensor in;
+    in.UnsafeCopyFromInternal(inp, src_dtype_, inp.shape());
     Tensor* out = nullptr;
-    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, inp.shape(), &out));
-    work_(ctx, inp, out);
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, in.shape(), &out));
+    out->set_dtype(dst_dtype_);
+    work_(ctx, in, out, use_truncation_);
+    out->set_dtype(external_dst_dtype_);
   }
 }
 
 Status CastOpBase::Unimplemented() {
-  return errors::Unimplemented("Cast ", DataTypeString(src_dtype_), " to ",
-                               DataTypeString(dst_dtype_), " is not supported");
+  return errors::Unimplemented("Cast ", DataTypeString(external_src_dtype_),
+                               " to ", DataTypeString(external_dst_dtype_),
+                               " is not supported");
 }
 
 CpuCastOp::CpuCastOp(OpKernelConstruction* ctx) : CastOpBase(ctx) {
@@ -80,7 +118,7 @@ CpuCastOp::CpuCastOp(OpKernelConstruction* ctx) : CastOpBase(ctx) {
 }
 
 Status CpuCastOp::Prepare() {
-  if (src_dtype_ == dst_dtype_) {
+  if (external_src_dtype_ == external_dst_dtype_) {
     work_ = nullptr;  // Identity
     return Status::OK();
   }
@@ -133,7 +171,7 @@ class GpuCastOp : public CastOpBase {
 
  private:
   Status Prepare() {
-    if (src_dtype_ == dst_dtype_) {
+    if (external_src_dtype_ == external_dst_dtype_) {
       work_ = nullptr;  // Identity
       return Status::OK();
     }
@@ -215,7 +253,7 @@ class SyclCastOp : public CastOpBase {
 
  private:
   Status Prepare() {
-    if (src_dtype_ == dst_dtype_) {
+    if (external_src_dtype_ == external_dst_dtype_) {
       work_ = nullptr;  // Identity
       return Status::OK();
     }
diff --git a/tensorflow/core/kernels/cast_op.h b/tensorflow/core/kernels/cast_op.h
index 16d2e0e0a56d1f2f45a9394979b8cdcec1391154..527ab528c9e2ec368ea486431f20b00076cb7109 100644
--- a/tensorflow/core/kernels/cast_op.h
+++ b/tensorflow/core/kernels/cast_op.h
@@ -24,8 +24,71 @@ limitations under the License.
 #include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/types.h"
 
+// Note that the GPU cast functor templates need to be instantiated unlike the
+// CPU ones, and hence their specializations are different than that for CPUs.
+#ifdef SPECIALIZE_FOR_GPUS
+#define SPECIALIZE_CAST(DEVICE, OUT_TYPE, IN_OUT)                   \
+  template <typename Device>                                        \
+  struct CastFunctor<Device, OUT_TYPE, IN_OUT> {                    \
+    void operator()(const Device& d,                                \
+                    typename TTypes<OUT_TYPE>::Flat out_tensor,     \
+                    typename TTypes<IN_OUT>::ConstFlat in_tensor,   \
+                    bool truncate = false) {                        \
+      if (truncate) {                                               \
+        out_tensor.device(d) =                                      \
+            in_tensor.unaryExpr(LSBZeroSetter<IN_OUT, OUT_TYPE>())  \
+                .template cast<OUT_TYPE>();                         \
+      } else {                                                      \
+        out_tensor.device(d) = in_tensor.template cast<OUT_TYPE>(); \
+      }                                                             \
+    }                                                               \
+  };                                                                \
+  template struct CastFunctor<DEVICE, OUT_TYPE, IN_OUT>;
+#else
+#define SPECIALIZE_CAST(DEVICE, OUT_TYPE, IN_OUT)                   \
+  template <>                                                       \
+  struct CastFunctor<DEVICE, OUT_TYPE, IN_OUT> {                    \
+    void operator()(const DEVICE& d,                                \
+                    typename TTypes<OUT_TYPE>::Flat out_tensor,     \
+                    typename TTypes<IN_OUT>::ConstFlat in_tensor,   \
+                    bool truncate = false) {                        \
+      if (truncate) {                                               \
+        out_tensor.device(d) =                                      \
+            in_tensor.unaryExpr(LSBZeroSetter<IN_OUT, OUT_TYPE>())  \
+                .template cast<OUT_TYPE>();                         \
+      } else {                                                      \
+        out_tensor.device(d) = in_tensor.template cast<OUT_TYPE>(); \
+      }                                                             \
+    }                                                               \
+  };
+#endif
+
+#define CAST_FUNCTORS(devname)                                        \
+  SPECIALIZE_CAST(devname, float, double)                             \
+  SPECIALIZE_CAST(devname, float, std::complex<double>)               \
+  SPECIALIZE_CAST(devname, std::complex<float>, std::complex<double>) \
+  SPECIALIZE_CAST(devname, std::complex<float>, double)               \
+  SPECIALIZE_CAST(devname, Eigen::half, double)                       \
+  SPECIALIZE_CAST(devname, Eigen::half, float)                        \
+  SPECIALIZE_CAST(devname, Eigen::half, std::complex<double>)         \
+  SPECIALIZE_CAST(devname, Eigen::half, std::complex<float>)          \
+  SPECIALIZE_CAST(devname, bfloat16, float)                           \
+  template <typename OUT_TYPE, typename IN_OUT>                       \
+  struct CastFunctor<devname, OUT_TYPE, IN_OUT> {                     \
+    void operator()(const devname& d,                                 \
+                    typename TTypes<OUT_TYPE>::Flat out_tensor,       \
+                    typename TTypes<IN_OUT>::ConstFlat in_tensor,     \
+                    bool truncate = false) {                          \
+      out_tensor.device(d) = in_tensor.template cast<OUT_TYPE>();     \
+    }                                                                 \
+  };
+
 namespace tensorflow {
 
+typedef std::function<void(OpKernelContext*, const Tensor&, Tensor*,
+                           bool trunc)>
+    CastFunctorType;
+
 // Common base class of Cast kernels
 class CastOpBase : public OpKernel {
  public:
@@ -36,8 +99,10 @@ class CastOpBase : public OpKernel {
  protected:
   DataType src_dtype_;
   DataType dst_dtype_;
-  std::function<void(OpKernelContext*, const Tensor&, Tensor*)> work_ = nullptr;
-
+  DataType external_src_dtype_;
+  DataType external_dst_dtype_;
+  bool use_truncation_;
+  CastFunctorType work_ = nullptr;
   Status Unimplemented();
 
   TF_DISALLOW_COPY_AND_ASSIGN(CastOpBase);
@@ -54,6 +119,23 @@ class CpuCastOp : public CastOpBase {
 
 namespace functor {
 
+template <typename I>
+constexpr int MantissaWidth() {
+  return std::numeric_limits<I>::digits;
+}
+
+template <>
+constexpr int MantissaWidth<Eigen::half>() {
+  // Remember, there's 1 hidden bit
+  return 10 + 1;
+}
+
+template <>
+constexpr int MantissaWidth<bfloat16>() {
+  // Remember, there's 1 hidden bit
+  return 7 + 1;
+}
+
 template <typename Device, typename Tout, typename Tin>
 void Cast(const Device& d, typename TTypes<Tout>::Flat o,
           typename TTypes<Tin>::ConstFlat i) {
@@ -63,7 +145,85 @@ void Cast(const Device& d, typename TTypes<Tout>::Flat o,
 template <typename Device, typename Tout, typename Tin>
 struct CastFunctor {
   void operator()(const Device& d, typename TTypes<Tout>::Flat o,
-                  typename TTypes<Tin>::ConstFlat i);
+                  typename TTypes<Tin>::ConstFlat i, bool truncate = false);
+};
+
+// Only enable LSBZeroSetterHelper for 64 and 32 bit input data types.
+// Specialize for others if needed in future.
+template <typename I>
+typename std::enable_if<sizeof(I) == 8, void>::type EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE static LSBZeroSetterHelper(I& t, int n) {
+  // Only zero the bits for non-NaNs.
+  // For NaNs, let the non-truncation version handle it.
+  if (!std::isnan(t)) {
+    uint64_t* p = reinterpret_cast<uint64_t*>(&t);
+    *p &= (0xFFFFFFFFFFFFFFFF << n);
+  }
+}
+
+template <typename I>
+typename std::enable_if<sizeof(I) == 4, void>::type EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE static LSBZeroSetterHelper(I& t, int n) {
+  // Only zero the bits for non-NaNs.
+  // For NaNs, let the non-truncation version handle it.
+  if (!std::isnan(t)) {
+    uint32_t* p = reinterpret_cast<uint32_t*>(&t);
+    *p &= (0xFFFFFFFF << n);
+  }
+}
+
+// Set n least significant bits to 0
+template <typename I, typename O>
+struct LSBZeroSetter {
+  EIGEN_EMPTY_STRUCT_CTOR(LSBZeroSetter)
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const I operator()(const I& a) const {
+    constexpr int bits = MantissaWidth<I>() - MantissaWidth<O>();
+    static_assert(
+        bits > 0,
+        "The output type must have fewer mantissa bits than the input type\n");
+    I t = a;
+    LSBZeroSetterHelper(t, bits);
+    return t;
+  }
+};
+
+template <typename I, typename O>
+struct LSBZeroSetter<std::complex<I>, std::complex<O>> {
+  EIGEN_EMPTY_STRUCT_CTOR(LSBZeroSetter)
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<I> operator()(
+      const std::complex<I>& a) const {
+    constexpr int bits = MantissaWidth<I>() - MantissaWidth<O>();
+    static_assert(
+        bits > 0,
+        "The output type must have fewer mantissa bits than the input type\n");
+    I re = std::real(a);
+    I img = std::imag(a);
+    LSBZeroSetterHelper(re, bits);
+    LSBZeroSetterHelper(img, bits);
+    std::complex<I> toReturn(re, img);
+    return toReturn;
+  }
+};
+
+template <typename I, typename O>
+struct LSBZeroSetter<std::complex<I>, O> {
+  EIGEN_EMPTY_STRUCT_CTOR(LSBZeroSetter)
+  // Sets the 16 LSBits of the float to 0
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::complex<I> operator()(
+      const std::complex<I>& a) const {
+    constexpr int bits = MantissaWidth<I>() - MantissaWidth<O>();
+    static_assert(
+        bits > 0,
+        "The output type must have fewer mantissa bits than the input type\n");
+    I re = std::real(a);
+    I img = std::imag(a);
+    LSBZeroSetterHelper(re, bits);
+    LSBZeroSetterHelper(img, bits);
+    std::complex<I> toReturn(re, img);
+    return toReturn;
+  }
 };
 
 }  // end namespace functor
diff --git a/tensorflow/core/kernels/cast_op_gpu.cu.cc b/tensorflow/core/kernels/cast_op_gpu.cu.cc
index 607e7f5efdef8694cc2f8f9d0e2c791a104f0eab..036996fca2725c009ba0b0c6799df7cd8b6b0871 100644
--- a/tensorflow/core/kernels/cast_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/cast_op_gpu.cu.cc
@@ -18,22 +18,19 @@ limitations under the License.
 #define EIGEN_USE_GPU
 
 #include "tensorflow/core/framework/bfloat16.h"
+#define SPECIALIZE_FOR_GPUS
 #include "tensorflow/core/kernels/cast_op.h"
+#undef SPECIALIZE_FOR_GPUS
 
 namespace tensorflow {
 namespace functor {
 
 typedef Eigen::GpuDevice GPUDevice;
 
-template <typename O, typename I>
-struct CastFunctor<GPUDevice, O, I> {
-  void operator()(const GPUDevice& d, typename TTypes<O>::Flat o,
-                  typename TTypes<I>::ConstFlat i) {
-    Cast<GPUDevice, O, I>(d, o, i);
-  }
-};
+CAST_FUNCTORS(GPUDevice);
 
 #define DEFINE(O, I) template struct CastFunctor<GPUDevice, O, I>
+
 #define DEFINE_ALL_FROM(in_type)        \
   DEFINE(in_type, bool);                \
   DEFINE(in_type, uint8);               \
@@ -59,14 +56,43 @@ DEFINE_ALL_FROM(int8);
 DEFINE_ALL_FROM(int16);
 DEFINE_ALL_FROM(int32);
 DEFINE_ALL_FROM(int64);
-DEFINE_ALL_FROM(Eigen::half);
-DEFINE_ALL_FROM(float);
 DEFINE_ALL_FROM(double);
-DEFINE_ALL_FROM(std::complex<float>);
 DEFINE_ALL_FROM(std::complex<double>);
-DEFINE(bfloat16, float);
 DEFINE(float, bfloat16);
 
+#define DEFINE_ALL_TO_FLOAT(out_type) \
+  DEFINE(out_type, bool);             \
+  DEFINE(out_type, uint8);            \
+  DEFINE(out_type, uint16);           \
+  DEFINE(out_type, uint32);           \
+  DEFINE(out_type, uint64);           \
+  DEFINE(out_type, int8);             \
+  DEFINE(out_type, int16);            \
+  DEFINE(out_type, int32);            \
+  DEFINE(out_type, int64);            \
+  DEFINE(out_type, Eigen::half);      \
+  DEFINE(out_type, float);            \
+  DEFINE(out_type, std::complex<float>)
+
+#define DEFINE_ALL_TO_HALF(out_type) \
+  DEFINE(out_type, bool);            \
+  DEFINE(out_type, uint8);           \
+  DEFINE(out_type, uint16);          \
+  DEFINE(out_type, uint32);          \
+  DEFINE(out_type, uint64);          \
+  DEFINE(out_type, int8);            \
+  DEFINE(out_type, int16);           \
+  DEFINE(out_type, int32);           \
+  DEFINE(out_type, int64);           \
+  DEFINE(out_type, Eigen::half)
+
+DEFINE_ALL_TO_HALF(Eigen::half);
+DEFINE_ALL_TO_HALF(bfloat16);
+DEFINE_ALL_TO_FLOAT(float);
+DEFINE_ALL_TO_FLOAT(std::complex<float>);
+
+#undef DEFINE_ALL_TO_FLOAT
+#undef DEFINE_ALL_TO_HALF
 #undef DEFINE_ALL_FROM
 #undef DEFINE
 
diff --git a/tensorflow/core/kernels/cast_op_impl.h b/tensorflow/core/kernels/cast_op_impl.h
index fe821b25df52ae8e5c13e5681b6e6d76d5bbdcab..b899bac681f654a1d7523eee84d3457d2a25417b 100644
--- a/tensorflow/core/kernels/cast_op_impl.h
+++ b/tensorflow/core/kernels/cast_op_impl.h
@@ -25,22 +25,10 @@ namespace tensorflow {
 
 namespace functor {
 
-template <typename O, typename I>
-struct CastFunctor<Eigen::ThreadPoolDevice, O, I> {
-  void operator()(const Eigen::ThreadPoolDevice& d, typename TTypes<O>::Flat o,
-                  typename TTypes<I>::ConstFlat i) {
-    o.device(d) = i.template cast<O>();
-  }
-};
+CAST_FUNCTORS(Eigen::ThreadPoolDevice);
 
 #ifdef TENSORFLOW_USE_SYCL
-template <typename O, typename I>
-struct CastFunctor<Eigen::SyclDevice, O, I> {
-  void operator()(const Eigen::SyclDevice& d, typename TTypes<O>::Flat o,
-                  typename TTypes<I>::ConstFlat i) {
-    o.device(d) = i.template cast<O>();
-  }
-};
+CAST_FUNCTORS(Eigen::SyclDevice);
 #endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace functor
@@ -68,139 +56,103 @@ struct CastFunctor<Eigen::SyclDevice, O, I> {
   CURRY_TYPES3_NO_BF16(FN, arg0, arg1) \
   FN(arg0, arg1, bfloat16);
 
-#define CAST_CASE(DEVICE, IN, OUT)                                         \
-  if (DataTypeToEnum<OUT>::value == dst_dtype) {                           \
-    return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {      \
-      functor::CastFunctor<DEVICE, OUT, IN> func;                          \
-      func(ctx->eigen_device<DEVICE>(), out->flat<OUT>(), inp.flat<IN>()); \
-    };                                                                     \
+#define CAST_CASE(DEVICE, IN, OUT)                                        \
+  if (DataTypeToEnum<OUT>::value == dst_dtype) {                          \
+    return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out,       \
+              bool truncate) {                                            \
+      functor::CastFunctor<DEVICE, OUT, IN> func;                         \
+      func(ctx->eigen_device<DEVICE>(), out->flat<OUT>(), inp.flat<IN>(), \
+           truncate);                                                     \
+    };                                                                    \
   }
 
 // The functions below are implemented in the cast_op_impl_*.cc files.
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromBool(DataType dst_dtype);
+CastFunctorType GetCpuCastFromBool(DataType dst_dtype);
+
+CastFunctorType GetCpuCastFromUint8(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromUint8(DataType dst_dtype);
+CastFunctorType GetCpuCastFromUint16(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromUint16(DataType dst_dtype);
+CastFunctorType GetCpuCastFromInt8(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromUint32(DataType dst_dtype);
+CastFunctorType GetCpuCastFromUint32(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromUint64(DataType dst_dtype);
+CastFunctorType GetCpuCastFromUint64(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromInt8(DataType dst_dtype);
+CastFunctorType GetCpuCastFromInt8(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromInt16(DataType dst_dtype);
+CastFunctorType GetCpuCastFromInt16(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromInt32(DataType dst_dtype);
+CastFunctorType GetCpuCastFromInt32(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromInt64(DataType dst_dtype);
+CastFunctorType GetCpuCastFromInt64(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromHalf(DataType dst_dtype);
+CastFunctorType GetCpuCastFromHalf(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromFloat(DataType dst_dtype);
+CastFunctorType GetCpuCastFromFloat(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromDouble(DataType dst_dtype);
+CastFunctorType GetCpuCastFromDouble(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromComplex64(DataType dst_dtype);
+CastFunctorType GetCpuCastFromComplex64(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromComplex128(DataType dst_dtype);
+CastFunctorType GetCpuCastFromComplex128(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromBfloat(DataType dst_dtype);
+CastFunctorType GetCpuCastFromBfloat(DataType dst_dtype);
 
 #if GOOGLE_CUDA
 // Same, for GPU.
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromBool(DataType dst_dtype);
+CastFunctorType GetGpuCastFromBool(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromUint8(DataType dst_dtype);
+CastFunctorType GetGpuCastFromUint8(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromUint16(DataType dst_dtype);
+CastFunctorType GetGpuCastFromUint16(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromUint32(DataType dst_dtype);
+CastFunctorType GetGpuCastFromInt8(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromUint64(DataType dst_dtype);
+CastFunctorType GetGpuCastFromUint32(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromInt8(DataType dst_dtype);
+CastFunctorType GetGpuCastFromUint64(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromInt16(DataType dst_dtype);
+CastFunctorType GetGpuCastFromInt16(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromInt32(DataType dst_dtype);
+CastFunctorType GetGpuCastFromInt32(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromInt64(DataType dst_dtype);
+CastFunctorType GetGpuCastFromInt64(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromHalf(DataType dst_dtype);
+CastFunctorType GetGpuCastFromHalf(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromFloat(DataType dst_dtype);
+CastFunctorType GetGpuCastFromFloat(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromDouble(DataType dst_dtype);
+CastFunctorType GetGpuCastFromDouble(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromComplex64(DataType dst_dtype);
+CastFunctorType GetGpuCastFromComplex64(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromComplex128(DataType dst_dtype);
+CastFunctorType GetGpuCastFromComplex128(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromBfloat(DataType dst_dtype);
+CastFunctorType GetGpuCastFromBfloat(DataType dst_dtype);
 
 #endif  // GOOGLE_CUDA
 
 #ifdef TENSORFLOW_USE_SYCL
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromBool(DataType dst_dtype);
+CastFunctorType GetSyclCastFromBool(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromUint8(DataType dst_dtype);
+CastFunctorType GetSyclCastFromUint8(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromUint16(DataType dst_dtype);
+CastFunctorType GetSyclCastFromUint16(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromUint32(DataType dst_dtype);
+CastFunctorType GetSyclCastFromUint32(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromUint64(DataType dst_dtype);
+CastFunctorType GetSyclCastFromUint64(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromInt16(DataType dst_dtype);
+CastFunctorType GetSyclCastFromInt16(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromInt32(DataType dst_dtype);
+CastFunctorType GetSyclCastFromInt32(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromInt64(DataType dst_dtype);
+CastFunctorType GetSyclCastFromInt64(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromFloat(DataType dst_dtype);
+CastFunctorType GetSyclCastFromFloat(DataType dst_dtype);
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromDouble(DataType dst_dtype);
+CastFunctorType GetSyclCastFromDouble(DataType dst_dtype);
 #endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cast_op_impl_bfloat.cc b/tensorflow/core/kernels/cast_op_impl_bfloat.cc
index bfa7ba0d4770e7a4ac1493482d90b166a4fcd3a2..96aae1560805bd4d65503f2dceae870f94dcc1b6 100644
--- a/tensorflow/core/kernels/cast_op_impl_bfloat.cc
+++ b/tensorflow/core/kernels/cast_op_impl_bfloat.cc
@@ -22,20 +22,19 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromBfloat(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromBfloat(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, bfloat16);
   return nullptr;
 }
 
 #if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromBfloat(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromBfloat(DataType dst_dtype) {
   if (dst_dtype == DT_FLOAT) {
-    return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {
+    return [](OpKernelContext* ctx, const Tensor& inp, Tensor* out,
+              bool truncate) {
       functor::CastFunctor<GPUDevice, float, bfloat16> func;
       func(ctx->eigen_device<GPUDevice>(), out->flat<float>(),
-           inp.flat<bfloat16>());
+           inp.flat<bfloat16>(), truncate);
     };
   }
   return nullptr;
diff --git a/tensorflow/core/kernels/cast_op_impl_bool.cc b/tensorflow/core/kernels/cast_op_impl_bool.cc
index c5c7394b43c92069aec4a46c9a712da1f606f6a8..792d4781f22b63884357feb0570a1a99228a9ea7 100644
--- a/tensorflow/core/kernels/cast_op_impl_bool.cc
+++ b/tensorflow/core/kernels/cast_op_impl_bool.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromBool(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromBool(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, bool);
   return nullptr;
 }
 
 #if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromBool(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromBool(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, bool);
   return nullptr;
 }
@@ -36,8 +34,7 @@ GetGpuCastFromBool(DataType dst_dtype) {
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromBool(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromBool(DataType dst_dtype) {
   CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, bool);
   return nullptr;
 }
diff --git a/tensorflow/core/kernels/cast_op_impl_complex128.cc b/tensorflow/core/kernels/cast_op_impl_complex128.cc
index 52899d58cdcff2df7fca07d223cc060ba080be82..9a184e5954a0cca6ddc7f3621458a58daa6627c7 100644
--- a/tensorflow/core/kernels/cast_op_impl_complex128.cc
+++ b/tensorflow/core/kernels/cast_op_impl_complex128.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromComplex128(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromComplex128(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, std::complex<double>);
   return nullptr;
 }
 
 #if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromComplex128(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromComplex128(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, std::complex<double>);
   return nullptr;
 }
diff --git a/tensorflow/core/kernels/cast_op_impl_complex64.cc b/tensorflow/core/kernels/cast_op_impl_complex64.cc
index 617bda53d5822f67186088c0251caf9c108e6a7d..77bc620b46031674f2a929cacf981652297a1cb5 100644
--- a/tensorflow/core/kernels/cast_op_impl_complex64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_complex64.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromComplex64(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromComplex64(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, std::complex<float>);
   return nullptr;
 }
 
 #if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromComplex64(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromComplex64(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, std::complex<float>);
   return nullptr;
 }
diff --git a/tensorflow/core/kernels/cast_op_impl_double.cc b/tensorflow/core/kernels/cast_op_impl_double.cc
index 7dc485ddad275d6fcdcc54506fabce2a90819645..ff9056897f8a71777c8f37fba53b9a57943b9c2f 100644
--- a/tensorflow/core/kernels/cast_op_impl_double.cc
+++ b/tensorflow/core/kernels/cast_op_impl_double.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromDouble(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromDouble(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, double);
   return nullptr;
 }
 
 #if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromDouble(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromDouble(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, double);
   return nullptr;
 }
@@ -36,8 +34,7 @@ GetGpuCastFromDouble(DataType dst_dtype) {
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromDouble(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromDouble(DataType dst_dtype) {
   CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, double);
   return nullptr;
 }
diff --git a/tensorflow/core/kernels/cast_op_impl_float.cc b/tensorflow/core/kernels/cast_op_impl_float.cc
index 1c933914fde14987562b1d796ebf6621d7980b28..f1e8f0e37b964f42bca9bc27b6547d01dbb23719 100644
--- a/tensorflow/core/kernels/cast_op_impl_float.cc
+++ b/tensorflow/core/kernels/cast_op_impl_float.cc
@@ -22,15 +22,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromFloat(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromFloat(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, float);
   return nullptr;
 }
 
 #if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromFloat(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromFloat(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, GPUDevice, float);
   return nullptr;
 }
@@ -38,8 +36,7 @@ GetGpuCastFromFloat(DataType dst_dtype) {
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromFloat(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromFloat(DataType dst_dtype) {
   CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, float);
   return nullptr;
 }
diff --git a/tensorflow/core/kernels/cast_op_impl_half.cc b/tensorflow/core/kernels/cast_op_impl_half.cc
index ef4b94e3263054f46bbe5b7c5487cc7b30990995..5da3a013528c3a071bfe852d82abfd331db2fb36 100644
--- a/tensorflow/core/kernels/cast_op_impl_half.cc
+++ b/tensorflow/core/kernels/cast_op_impl_half.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromHalf(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromHalf(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, Eigen::half);
   return nullptr;
 }
 
 #if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromHalf(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromHalf(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, Eigen::half);
   return nullptr;
 }
diff --git a/tensorflow/core/kernels/cast_op_impl_int16.cc b/tensorflow/core/kernels/cast_op_impl_int16.cc
index 59360f744573803f44cf7c31d5acf836e580fdc4..440ee88fb510073cfd9135d1276ca965c6094416 100644
--- a/tensorflow/core/kernels/cast_op_impl_int16.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int16.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromInt16(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromInt16(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, int16);
   return nullptr;
 }
 
 #if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromInt16(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromInt16(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int16);
   return nullptr;
 }
@@ -36,8 +34,7 @@ GetGpuCastFromInt16(DataType dst_dtype) {
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromInt16(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromInt16(DataType dst_dtype) {
   CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, int16);
   return nullptr;
 }
diff --git a/tensorflow/core/kernels/cast_op_impl_int32.cc b/tensorflow/core/kernels/cast_op_impl_int32.cc
index a867392fde1c4aa45960bd5a490c2664c0ba0005..4b3e7efddc1976ee615dfca345dd65c30db74b49 100644
--- a/tensorflow/core/kernels/cast_op_impl_int32.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int32.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromInt32(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromInt32(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, int32);
   return nullptr;
 }
 
 #if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromInt32(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromInt32(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int32);
   return nullptr;
 }
@@ -36,8 +34,7 @@ GetGpuCastFromInt32(DataType dst_dtype) {
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromInt32(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromInt32(DataType dst_dtype) {
   CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, int32);
   return nullptr;
 }
diff --git a/tensorflow/core/kernels/cast_op_impl_int64.cc b/tensorflow/core/kernels/cast_op_impl_int64.cc
index 467a8f6c89b35ea1f1c8da1327f6d204b7e876b0..0f711aa560233ecb217953799f8c6c2f243748db 100644
--- a/tensorflow/core/kernels/cast_op_impl_int64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int64.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromInt64(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromInt64(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, int64);
   return nullptr;
 }
 
 #if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromInt64(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromInt64(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int64);
   return nullptr;
 }
@@ -36,8 +34,7 @@ GetGpuCastFromInt64(DataType dst_dtype) {
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromInt64(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromInt64(DataType dst_dtype) {
   CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, int64);
   return nullptr;
 }
diff --git a/tensorflow/core/kernels/cast_op_impl_int8.cc b/tensorflow/core/kernels/cast_op_impl_int8.cc
index 21002a4321be4474fdd6f60e61afce539d2ea177..eac185d5a07cdb1db02d9d72e125ee25928cc04e 100644
--- a/tensorflow/core/kernels/cast_op_impl_int8.cc
+++ b/tensorflow/core/kernels/cast_op_impl_int8.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromInt8(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromInt8(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, int8);
   return nullptr;
 }
 
 #if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromInt8(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromInt8(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, int8);
   return nullptr;
 }
@@ -36,8 +34,7 @@ GetGpuCastFromInt8(DataType dst_dtype) {
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromInt8(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromInt8(DataType dst_dtype) {
   CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, int8);
   return nullptr;
 }
diff --git a/tensorflow/core/kernels/cast_op_impl_uint16.cc b/tensorflow/core/kernels/cast_op_impl_uint16.cc
index cd829bae2a90af6daecf1a6f67be96cdb1854140..3aebbdc1f37a3b32f5011af9b9407debca8253e9 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint16.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint16.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromUint16(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromUint16(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, uint16);
   return nullptr;
 }
 
 #if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromUint16(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromUint16(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, uint16);
   return nullptr;
 }
@@ -36,8 +34,7 @@ GetGpuCastFromUint16(DataType dst_dtype) {
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromUint16(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromUint16(DataType dst_dtype) {
   CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, uint16);
   return nullptr;
 }
diff --git a/tensorflow/core/kernels/cast_op_impl_uint32.cc b/tensorflow/core/kernels/cast_op_impl_uint32.cc
index d1a854d98be23fb5ba7b4f58a6379a51a4ec35db..86f5961bcc7411539fbba62fb41c41e7f5aaf35e 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint32.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint32.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromUint32(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromUint32(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, uint32);
   return nullptr;
 }
 
 #if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromUint32(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromUint32(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, uint32);
   return nullptr;
 }
@@ -36,8 +34,7 @@ GetGpuCastFromUint32(DataType dst_dtype) {
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromUint32(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromUint32(DataType dst_dtype) {
   CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, uint32);
   return nullptr;
 }
diff --git a/tensorflow/core/kernels/cast_op_impl_uint64.cc b/tensorflow/core/kernels/cast_op_impl_uint64.cc
index 604e0424fcb61fc4c69892713152c52415a60741..6478c266ee997e7cc47100b7a4af05888a46cfdf 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint64.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint64.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromUint64(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromUint64(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, uint64);
   return nullptr;
 }
 
 #if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromUint64(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromUint64(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, uint64);
   return nullptr;
 }
@@ -36,8 +34,7 @@ GetGpuCastFromUint64(DataType dst_dtype) {
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromUint64(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromUint64(DataType dst_dtype) {
   CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, uint64);
   return nullptr;
 }
diff --git a/tensorflow/core/kernels/cast_op_impl_uint8.cc b/tensorflow/core/kernels/cast_op_impl_uint8.cc
index 2d1a6f3a4edc72bfea53a2b95113d32e5b76c913..b22547a23ec693ee073c298374e17aec098fbd83 100644
--- a/tensorflow/core/kernels/cast_op_impl_uint8.cc
+++ b/tensorflow/core/kernels/cast_op_impl_uint8.cc
@@ -20,15 +20,13 @@ namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
 
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetCpuCastFromUint8(DataType dst_dtype) {
+CastFunctorType GetCpuCastFromUint8(DataType dst_dtype) {
   CURRY_TYPES3(CAST_CASE, CPUDevice, uint8);
   return nullptr;
 }
 
 #if GOOGLE_CUDA
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetGpuCastFromUint8(DataType dst_dtype) {
+CastFunctorType GetGpuCastFromUint8(DataType dst_dtype) {
   CURRY_TYPES3_NO_BF16(CAST_CASE, GPUDevice, uint8);
   return nullptr;
 }
@@ -36,8 +34,7 @@ GetGpuCastFromUint8(DataType dst_dtype) {
 
 #ifdef TENSORFLOW_USE_SYCL
 typedef Eigen::SyclDevice SYCLDevice;
-std::function<void(OpKernelContext*, const Tensor&, Tensor*)>
-GetSyclCastFromUint8(DataType dst_dtype) {
+CastFunctorType GetSyclCastFromUint8(DataType dst_dtype) {
   CURRY_TYPES3_NO_HALF(CAST_CASE, SYCLDevice, uint8);
   return nullptr;
 }
diff --git a/tensorflow/core/kernels/cast_op_test.cc b/tensorflow/core/kernels/cast_op_test.cc
index 75e21802c05b14f1689cb4ed3061f4279265eabc..cb305de5e3c9929254785899686abbed3c8376bf 100644
--- a/tensorflow/core/kernels/cast_op_test.cc
+++ b/tensorflow/core/kernels/cast_op_test.cc
@@ -40,17 +40,27 @@ static Graph* Cast(int num) {
 
 class CastOpTest : public OpsTestBase {
  protected:
-  void MakeOp(DataType src, DataType dst) {
-    TF_EXPECT_OK(NodeDefBuilder("cast_op", "Cast")
-                     .Input(FakeInput(src))
-                     .Attr("SrcT", src)
-                     .Attr("DstT", dst)
-                     .Finalize(node_def()));
+  void MakeOp(DataType src, DataType dst, bool trunc = false) {
+    if (trunc) {
+      TF_EXPECT_OK(NodeDefBuilder("cast_op", "Cast")
+                       .Input(FakeInput(src))
+                       .Attr("SrcT", src)
+                       .Attr("DstT", dst)
+                       .Attr("Truncate", true)
+                       .Finalize(node_def()));
+    } else {
+      TF_EXPECT_OK(NodeDefBuilder("cast_op", "Cast")
+                       .Input(FakeInput(src))
+                       .Attr("SrcT", src)
+                       .Attr("DstT", dst)
+                       .Finalize(node_def()));
+    }
+
     TF_EXPECT_OK(InitOp());
   }
 
   template <typename INPUT, typename OUTPUT>
-  void CheckCast() {
+  void CheckCast(bool trunc = false) {
     DataType in_type = DataTypeToEnum<INPUT>::v();
     DataType out_type = DataTypeToEnum<OUTPUT>::v();
     MakeOp(in_type, out_type);
@@ -64,8 +74,9 @@ class CastOpTest : public OpsTestBase {
   }
 };
 
-#define TEST_CAST(in, out) \
-  TEST_F(CastOpTest, TestCast##_##in##_##out) { CheckCast<in, out>(); }
+#define TEST_CAST(in, out)                                              \
+  TEST_F(CastOpTest, TestCast##_##in##_##out) { CheckCast<in, out>(); } \
+  TEST_F(CastOpTest, TestCast2##_##in##_##out) { CheckCast<in, out>(true); }
 
 #define TEST_ALL_CASTS_FROM(in) \
   TEST_CAST(in, uint8);         \
@@ -78,7 +89,12 @@ class CastOpTest : public OpsTestBase {
   TEST_CAST(in, half);          \
   TEST_CAST(in, float);         \
   TEST_CAST(in, double);        \
-  TEST_CAST(in, bfloat16);
+  TEST_CAST(in, bfloat16);      \
+  TEST_CAST(in, quint8);        \
+  TEST_CAST(in, qint8);         \
+  TEST_CAST(in, qint32);        \
+  TEST_CAST(in, qint16);        \
+  TEST_CAST(in, quint16);
 
 TEST_ALL_CASTS_FROM(uint8)
 TEST_ALL_CASTS_FROM(uint16)
@@ -91,6 +107,11 @@ TEST_ALL_CASTS_FROM(half)
 TEST_ALL_CASTS_FROM(float)
 TEST_ALL_CASTS_FROM(double)
 TEST_ALL_CASTS_FROM(bfloat16)
+TEST_ALL_CASTS_FROM(quint8)
+TEST_ALL_CASTS_FROM(qint8)
+TEST_ALL_CASTS_FROM(qint32)
+TEST_ALL_CASTS_FROM(qint16)
+TEST_ALL_CASTS_FROM(quint16)
 
 #undef TEST_ALL_CASTS_FROM
 #undef TEST_CAST
diff --git a/tensorflow/core/kernels/conv_grad_filter_ops.cc b/tensorflow/core/kernels/conv_grad_filter_ops.cc
index aca75176a565dd5b76793b143eab19f5ffc5aad6..63b1bcda439e9aa931122bbd4ef3d47d94bfbd7c 100644
--- a/tensorflow/core/kernels/conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_filter_ops.cc
@@ -909,6 +909,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
       dims.in_depth,                       // in_depths
       {{input_desc.height(),               // in_rows
         input_desc.width()}},              // in_cols
+      FORMAT_NCHW,                         // compute_data_format
       dims.out_depth,                      // out_depths
       {{dims.spatial_dims[0].filter_size,  // filter_rows
         dims.spatial_dims[1].filter_size,  // filter_cols
diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index 63a775afa8bd69d5c9b97b5545086dface4312ac..d664a11e73c0264d31e302e4fc2b321855ddc526 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -957,6 +957,7 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
       dims.in_depth,                       // in_depths
       {{input_desc.height(),               // in_rows
         input_desc.width()}},              // in_cols
+      FORMAT_NCHW,                         // compute_data_format
       dims.out_depth,                      // out_depths
       {{dims.spatial_dims[0].filter_size,  // filter_rows
         dims.spatial_dims[1].filter_size,  // filter_cols
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index 980b1063de9997a05304c719857a3ea82f40e650..15f1bf9abaec9b35551f3de48e82021e127b3aa7 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -716,6 +716,7 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
         batch,
         in_depth,
         {{input_size[0], input_size[1], input_size[2]}},
+        FORMAT_NCHW,
         out_depth,
         {{filter_size[0], filter_size[1], filter_size[2]}},
         {{dilations[0], dilations[1], dilations[2]}},
@@ -1112,6 +1113,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
         batch,
         in_depth,
         {{input_size[0], input_size[1], input_size[2]}},
+        FORMAT_NCHW,
         out_depth,
         {{filter_size[0], filter_size[1], filter_size[2]}},
         {{dilations[0], dilations[1], dilations[2]}},
diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc
index 3b9886eece9ec7d1a931010416ac21889ca6b358..ef692418d6919e10eb9d5b08006597cd128bfe91 100644
--- a/tensorflow/core/kernels/conv_ops.cc
+++ b/tensorflow/core/kernels/conv_ops.cc
@@ -713,6 +713,7 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
       in_depths,         // in_depths
       {{in_rows,         // in_rows
         in_cols}},       // in_cols
+      FORMAT_NCHW,       // compute_data_format
       out_depths,        // out_depths
       {{patch_rows,      // filter_rows
         patch_cols,      // filter_cols
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 9ec16be67d8c0d6ea228898c14dbb575dd78cbe9..a1eed4e68c919a7c2e73aa5a782a6c2a7dc8b702 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -415,6 +415,7 @@ struct LaunchConvOp<GPUDevice, T> {
         in_batch,
         in_depth,
         {{in_planes, in_rows, in_cols}},
+        FORMAT_NCHW,
         out_depth,
         {{filter_planes, filter_rows, filter_cols}},
         {{dilations[0], dilations[1], dilations[2]}},
diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h
index d2c8020bb629c7c8ccbc3e06bdbe4f6af25833f2..afc611f27741c784bfc1b513f8033a6023634d76 100644
--- a/tensorflow/core/kernels/conv_ops_gpu.h
+++ b/tensorflow/core/kernels/conv_ops_gpu.h
@@ -85,13 +85,15 @@ class ConvParameters {
  public:
   using SpatialArray = gtl::InlinedVector<int64, 3>;
   ConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
-                 int64 out_depths, const SpatialArray& filter,
-                 const SpatialArray& dilation, const SpatialArray& stride,
-                 const SpatialArray& padding, DataType dtype, int device_id)
+                 TensorFormat data_format, int64 out_depths,
+                 const SpatialArray& filter, const SpatialArray& dilation,
+                 const SpatialArray& stride, const SpatialArray& padding,
+                 DataType dtype, int device_id)
       : batch_(batch),
         in_depths_(in_depths),
         out_depths_(out_depths),
         in_(in),
+        data_format_(data_format),
         filter_(filter),
         dilation_(dilation),
         stride_(stride),
@@ -101,6 +103,7 @@ class ConvParameters {
     hash_code_ = batch;
     hash_code_ = Hash64Combine(hash_code_, in_depths);
     for (int64 val : in) hash_code_ = Hash64Combine(hash_code_, val);
+    hash_code_ = Hash64Combine(hash_code_, data_format);
     hash_code_ = Hash64Combine(hash_code_, out_depths);
     for (int64 val : filter) hash_code_ = Hash64Combine(hash_code_, val);
     for (int64 val : dilation) hash_code_ = Hash64Combine(hash_code_, val);
@@ -123,6 +126,7 @@ class ConvParameters {
     return strings::StrCat(
         batch_, ", ", in_depths_, ", ",
         "(", str_util::Join(in_, ", "), "), ",
+        ::tensorflow::ToString(data_format_), ", ",
         out_depths_, ", ",
         "(", str_util::Join(filter_, ", "), "), ",
         "(", str_util::Join(dilation_, ", "), "), ",
@@ -148,12 +152,13 @@ class ConvParameters {
 
  protected:
   using ParameterDataType =
-      std::tuple<int64, int64, SpatialArray, int64, SpatialArray, SpatialArray,
-                 SpatialArray, SpatialArray, DataType, int>;
+      std::tuple<int64, int64, SpatialArray, TensorFormat, int64, SpatialArray,
+                 SpatialArray, SpatialArray, SpatialArray, DataType, int>;
 
   ParameterDataType get_data_as_tuple() const {
-    return std::make_tuple(batch_, in_depths_, in_, out_depths_, filter_,
-                           dilation_, stride_, padding_, dtype_, device_id_);
+    return std::make_tuple(batch_, in_depths_, in_, data_format_, out_depths_,
+                           filter_, dilation_, stride_, padding_, dtype_,
+                           device_id_);
   }
 
   uint64 hash_code_;
@@ -178,6 +183,7 @@ class ConvParameters {
   int64 in_depths_;
   int64 out_depths_;
   SpatialArray in_;
+  TensorFormat data_format_;
   SpatialArray filter_;
   SpatialArray dilation_;
   SpatialArray stride_;
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index 4f9a96ce1797a379aa8a02890f93c70e4ed8058d..c281153795919e45c16ac7f3a88be28c0df05f25 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -44,41 +44,43 @@ struct ConvParametersPeer {
 
 TEST(ConvParameters, WinogradNonfusedAlgoSize) {
   ConvParametersPeer conv_params_small = {{
-      1,         // batch
-      32,        // in_depths
-      {{300,     // in_rows
-        300}},   // in_cols
-      128,       // out_depths
-      {{3,       // filter_rows
-        3}},     // filter_cols
-      {{1,       // dilation_rows
-        1}},     // dilation_cols
-      {{1,       // stride_rows
-        1}},     // stride_cols
-      {{0,       // padding_rows
-        0}},     // padding_cols
-      DT_FLOAT,  // tensor datatype
-      0,         // device_id
+      1,            // batch
+      32,           // in_depths
+      {{300,        // in_rows
+        300}},      // in_cols
+      FORMAT_NCHW,  // compute_data_format
+      128,          // out_depths
+      {{3,          // filter_rows
+        3}},        // filter_cols
+      {{1,          // dilation_rows
+        1}},        // dilation_cols
+      {{1,          // stride_rows
+        1}},        // stride_cols
+      {{0,          // padding_rows
+        0}},        // padding_cols
+      DT_FLOAT,     // tensor datatype
+      0,            // device_id
   }};
   EXPECT_TRUE(
       conv_params_small.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<float>());
 
   ConvParametersPeer conv_params_large = {{
-      1,         // batch
-      128,       // in_depths
-      {{300,     // in_rows
-        300}},   // in_cols
-      768,       // out_depths
-      {{3,       // filter_rows
-        3}},     // filter_cols
-      {{1,       // dilation_rows
-        1}},     // dilation_cols
-      {{1,       // stride_rows
-        1}},     // stride_cols
-      {{0,       // padding_rows
-        0}},     // padding_cols
-      DT_FLOAT,  // tensor datatype
-      0,         // device_id
+      1,            // batch
+      128,          // in_depths
+      {{300,        // in_rows
+        300}},      // in_cols
+      FORMAT_NCHW,  // compute_data_format
+      768,          // out_depths
+      {{3,          // filter_rows
+        3}},        // filter_cols
+      {{1,          // dilation_rows
+        1}},        // dilation_cols
+      {{1,          // stride_rows
+        1}},        // stride_cols
+      {{0,          // padding_rows
+        0}},        // padding_cols
+      DT_FLOAT,     // tensor datatype
+      0,            // device_id
   }};
   EXPECT_FALSE(
       conv_params_large.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<float>());
diff --git a/tensorflow/core/kernels/crop_and_resize_op.cc b/tensorflow/core/kernels/crop_and_resize_op.cc
index 22524dc1c4d9d53d906410edd0aef5d0dd3f7fa8..99d01b4db6bac68d890d93ac55bea576f43a5994 100644
--- a/tensorflow/core/kernels/crop_and_resize_op.cc
+++ b/tensorflow/core/kernels/crop_and_resize_op.cc
@@ -28,7 +28,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
-#include "tensorflow/core/kernels/crop_resize_bilinear_core.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
@@ -43,10 +42,6 @@ limitations under the License.
 using stream_executor::cuda::ScopedActivateExecutorContext;
 #endif  // GOOGLE_CUDA
 
-using ::tensorflow::internal::CachedInterpolation;
-using ::tensorflow::internal::compute_interpolation_weights;
-using ::tensorflow::internal::crop_resize_single_image;
-
 namespace tensorflow {
 namespace {
 
@@ -254,34 +249,39 @@ struct CropAndResize<CPUDevice, T> {
             continue;
           }
           if (method_name == "bilinear") {
-            CachedInterpolation *interp_x = nullptr, *interp_y = nullptr;
-            int min_ix, max_ix, min_iy, max_iy;
-            compute_interpolation_weights(crop_width, image_width, x1, x2,
-                                          min_ix, max_ix, interp_x);
-            compute_interpolation_weights(crop_height, image_height, y1, y2,
-                                          min_iy, max_iy, interp_y);
-
-            // multiply by depth to avoid multiplication in resize_single_image.
-            for (int i = min_ix; i <= max_ix; ++i) {
-              interp_x[i - min_ix].lower *= depth;
-              interp_x[i - min_ix].upper *= depth;
-            }
+            const int top_y_index = floorf(in_y);
+            const int bottom_y_index = ceilf(in_y);
+            const float y_lerp = in_y - top_y_index;
+
+            for (int x = 0; x < crop_width; ++x) {
+              const float in_x = (crop_width > 1)
+                                     ? x1 * (image_width - 1) + x * width_scale
+                                     : 0.5 * (x1 + x2) * (image_width - 1);
+              if (in_x < 0 || in_x > image_width - 1) {
+                for (int d = 0; d < depth; ++d) {
+                  crops(b, y, x, d) = extrapolation_value;
+                }
+                continue;
+              }
+              const int left_x_index = floorf(in_x);
+              const int right_x_index = ceilf(in_x);
+              const float x_lerp = in_x - left_x_index;
 
-            crop_resize_single_image<T, float>(
-                image.data() + static_cast<int64>(b_in) *
-                                   static_cast<int64>(image_height) *
-                                   static_cast<int64>(image_width) *
-                                   static_cast<int64>(depth),
-                image_height, image_width, crop_height, crop_width, depth,
-                min_ix, max_ix, interp_x, min_iy, max_iy, interp_y,
-                extrapolation_value, false, false,
-                crops.data() + static_cast<int64>(b) *
-                                   static_cast<int64>(crop_height) *
-                                   static_cast<int64>(crop_width) *
-                                   static_cast<int64>(depth));
-
-            delete[] interp_y;
-            delete[] interp_x;
+              for (int d = 0; d < depth; ++d) {
+                const float top_left(static_cast<float>(
+                    image(b_in, top_y_index, left_x_index, d)));
+                const float top_right(static_cast<float>(
+                    image(b_in, top_y_index, right_x_index, d)));
+                const float bottom_left(static_cast<float>(
+                    image(b_in, bottom_y_index, left_x_index, d)));
+                const float bottom_right(static_cast<float>(
+                    image(b_in, bottom_y_index, right_x_index, d)));
+                const float top = top_left + (top_right - top_left) * x_lerp;
+                const float bottom =
+                    bottom_left + (bottom_right - bottom_left) * x_lerp;
+                crops(b, y, x, d) = top + (bottom - top) * y_lerp;
+              }
+            }
           } else {  // method == "nearest"
             for (int x = 0; x < crop_width; ++x) {
               const float in_x = (crop_width > 1)
diff --git a/tensorflow/core/kernels/crop_resize_bilinear_core.h b/tensorflow/core/kernels/crop_resize_bilinear_core.h
deleted file mode 100644
index 51327aca67ae08f559272aa9a5e8bf85b113be25..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/crop_resize_bilinear_core.h
+++ /dev/null
@@ -1,464 +0,0 @@
-/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_KERNELS_CROP_RESIZE_BILINEAR_CORE_H_
-#define TENSORFLOW_CORE_KERNELS_CROP_RESIZE_BILINEAR_CORE_H_
-
-namespace tensorflow {
-namespace internal {
-// Compute the interpolation indices only once.
-struct CachedInterpolation {
-  int lower;  // Lower source index used in the interpolation
-  int upper;  // Upper source index used in the interpolation
-  // 1-D linear iterpolation scale (see:
-  // https://en.wikipedia.org/wiki/Bilinear_interpolation)
-  float lerp;
-};
-
-inline bool compute_single_interpolation_weight(
-    const int in_size, const float out2in_scale, const float out2in_start,
-    const bool clip, const int i, int* lower, int* upper, float* lerp) {
-  const float in = i * out2in_scale + out2in_start;
-  *lower = (int)floor(in);
-  *upper = (int)ceil(in);
-  *lerp = (float)(in - (float)*lower);
-  if (clip) {
-    if (*lower < 0)
-      *lower = 0;
-    else if (*lower >= in_size)
-      *lower = in_size - 1;
-    if (*upper < 0)
-      *upper = 0;
-    else if (*upper >= in_size)
-      *upper = in_size - 1;
-    return true;
-  } else {
-    return (*lower >= 0 && *upper < in_size) ? true : false;
-  }
-}
-/**
- * Compute interpolation values for output indexes in range
- * [out_start,out_start+out_size-1].
- * Returns true if all output indexes have lower and upper (input) indexes
- * within range [0,in_size-1].
- */
-inline bool compute_interpolation_weights(const int min_i, const int max_i,
-                                          const int in_size,
-                                          const float out2in_scale,
-                                          const float out2in_start,
-                                          const bool clip,
-                                          CachedInterpolation* interpolation) {
-  bool rval = true;
-  int num_i = max_i - min_i + 1;
-  for (int i = 0; i < num_i; ++i) {
-    if (!compute_single_interpolation_weight(
-            in_size, out2in_scale, out2in_start, clip, i + min_i,
-            &interpolation[i].lower, &interpolation[i].upper,
-            &interpolation[i].lerp)) {
-      rval = false;
-    }
-  }
-  return rval;
-}
-/**
- * Compatibility method for resize_bilinear_op.cc
- */
-inline void compute_interpolation_weights(const int out_size, const int in_size,
-                                          const float out2in_scale,
-                                          CachedInterpolation* interpolation) {
-  interpolation[out_size].lower = 0;
-  interpolation[out_size].upper = 0;
-  const bool clip = true;
-  if (!compute_interpolation_weights(0, out_size - 1, in_size, out2in_scale,
-                                     0.0f, clip, interpolation)) {
-    // Should never happen, check for it anyway
-    printf(
-        "Warning! Interpolation values have lower,upper indexes outside of "
-        "range [0,in_size-1]\n");
-  }
-}
-/**
- * Compute minimum and maximum (output) i where both lower and upper (input) is
- * in range [0,in_size-1]
- * If no values of i satisfy condition, min_i = in_size, max_i = -1 and method
- * returns false.
- * Returns true if min_i >= max_i.
- */
-inline bool compute_minmax_indexes(const int out_size, const int in_size,
-                                   const float out2in_scale,
-                                   const float out2in_start, int& min_i,
-                                   int& max_i) {
-  min_i = out_size;
-  max_i = -1;
-  int lower, upper;
-  float lerp;
-  for (int i = 0; i < out_size; ++i) {
-    if (compute_single_interpolation_weight(in_size, out2in_scale, out2in_start,
-                                            false, i, &lower, &upper, &lerp)) {
-      if (i < min_i) min_i = i;
-      if (i > max_i) max_i = i;
-    }
-  }
-  return (min_i <= max_i) ? true : false;
-}
-/**
- * Compute interpolation weights for crop_and_resize_op.cc
- * Also computes extrapolation areas.
- * Returns true if at least one point requires interpolation, false otherwise.
- */
-inline bool compute_interpolation_weights(
-    const int out_size, const int in_size,
-    const float x1,  // lower bounding box, crop region starts at in_size*x1
-    const float x2,  // upper bounding box, crop region ends at in_size*x2
-    int& min_i, int& max_i, CachedInterpolation*& interpolation) {
-  float out2in_start = out_size > 1
-                           ? (float)(in_size - 1) * (float)x1
-                           : (float)(in_size - 1) * (float)(x1 + x2) / 2.0f;
-  float out2in_scale = out_size > 1 ? (float)(x2 - x1) * (float)(in_size - 1) /
-                                          (float)(out_size - 1)
-                                    : 0.0f;
-  if (compute_minmax_indexes(out_size, in_size, out2in_scale, out2in_start,
-                             min_i, max_i)) {
-    interpolation = new CachedInterpolation[max_i - min_i + 1];
-    bool all_inputs_ok =
-        compute_interpolation_weights(min_i, max_i, in_size, out2in_scale,
-                                      out2in_start, false, interpolation);
-    if (!all_inputs_ok) {
-      // should never happen, purpose of compute_minmax_indexes is to ensure
-      // that all inputs are ok.
-      printf(
-          "Error! compute_interpolation_weights returned input indexes outside "
-          "valid range - SEGV will likely ensue.\n");
-    }
-    return true;
-  } else {
-    interpolation = 0l;
-    return false;
-  }
-}
-
-/**
- * Cast float v to type U with range clamping.
- *
- * If v<min_val, return value is clamped to u_min_val. similarly if v>max_val,
- * return value is clamped to u_max_val.
- */
-template <typename U>
-U cast_to(float v, float min_val, float max_val, U u_min_val, U u_max_val);
-template <typename U>
-inline U cast_to(float v, float min_val, float max_val, U u_min_val,
-                 U u_max_val) {
-  if (v < min_val)
-    return u_min_val;
-  else if (v > max_val)
-    return u_max_val;
-  else
-    return static_cast<U>(v);
-}
-template <>
-inline float cast_to<float>(float v, float min_val, float max_val,
-                            float u_min_val, float u_max_val) {
-  return v;
-}
-
-inline float compute_lerp(const float top_left, const float top_right,
-                          const float bottom_left, const float bottom_right,
-                          const float x_lerp, const float y_lerp) {
-  const float top = top_left + (top_right - top_left) * x_lerp;
-  const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
-  return top + (bottom - top) * y_lerp;
-}
-
-/**
- * Computes the bilinear interpolation from the appropriate 4 float points
- * and the linear interpolation weights.
- * Accepts input tensors of type T and produces output tensors of type U.
- * Optionally flips horizontal and/or vertical axis.
- */
-template <typename T, typename U>
-void crop_resize_single_image(const T* image, const int64 in_height,
-                              const int64 in_width, const int64 out_height,
-                              const int64 out_width, const int channels,
-                              const int min_ix, const int max_ix,
-                              const CachedInterpolation* xs, const int min_iy,
-                              const int max_iy, const CachedInterpolation* ys,
-                              const float extrapolated_value, const bool flip_x,
-                              const bool flip_y,
-                              U* output) TF_ATTRIBUTE_NOINLINE;
-template <typename T, typename U>
-void crop_resize_single_image(const T* image, const int64 in_height,
-                              const int64 in_width, const int64 out_height,
-                              const int64 out_width, const int channels,
-                              const int min_ix, const int max_ix,
-                              const CachedInterpolation* xs, const int min_iy,
-                              const int max_iy, const CachedInterpolation* ys,
-                              const float extrapolated_value, const bool flip_x,
-                              const bool flip_y, U* output) {
-  const int64 in_row_size = in_width * channels;
-  const int64 out_row_size = out_width * channels;
-  U u_min_val = std::numeric_limits<U>::min();
-  U u_max_val = std::numeric_limits<U>::max();
-  float min_val = static_cast<float>(u_min_val);
-  float max_val = static_cast<float>(u_max_val);
-  U uEx =
-      cast_to<U>(extrapolated_value, min_val, max_val, u_min_val, u_max_val);
-  // low y extrapolation zone
-  if (min_iy > 0) {
-    U* p = flip_y ? output + out_row_size * (out_height - min_iy) : output;
-    int64 nn = out_row_size * (int64)min_iy;
-    for (int64 i = 0; i < nn; ++i) p[i] = uEx;
-  }
-  // high y extrapolation zone
-  if (max_iy < out_height - 1) {
-    U* p = flip_y ? output : output + out_row_size * (max_iy + 1);
-    int64 nn = out_row_size * (int64)(out_height - 1 - max_iy);
-    for (int64 i = 0; i < nn; ++i) p[i] = uEx;
-  }
-  // low x extrapolation zone
-  if (min_ix > 0) {
-    for (int iy = min_iy; iy <= max_iy; ++iy) {
-      int xx0 = flip_x ? (out_width - min_ix) * channels : 0;
-      int nxx = min_ix * channels;
-      U* p = output + xx0 +
-             out_row_size * (int64)(flip_y ? out_height - 1 - iy : iy);
-      for (int ix = 0; ix < nxx; ++ix) {
-        p[ix] = uEx;
-      }
-    }
-  }
-  // high x extrapolation zone
-  if (max_ix < out_width - 1) {
-    for (int iy = min_iy; iy <= max_iy; ++iy) {
-      int xx0 = flip_x ? 0 : (max_ix + 1) * channels;
-      int nxx = (out_width - 1 - max_ix) * channels;
-      U* p = output + xx0 +
-             out_row_size * (int64)(flip_y ? out_height - 1 - iy : iy);
-      for (int ix = 0; ix < nxx; ++ix) {
-        p[ix] = uEx;
-      }
-    }
-  }
-  U* output_y_ptr =
-      output +
-      out_row_size * (int64)(flip_y ? out_height - 1 - min_iy : min_iy);
-  // interpolation zone
-  if (channels == 1) {
-    for (int y = min_iy; y <= max_iy; ++y) {
-      const int iy = y - min_iy;
-      const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size;
-      const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size;
-      const float ys_lerp = ys[iy].lerp;
-      const int x0 = flip_x ? out_width - 1 - max_ix : min_ix;
-      const int x1 = flip_x ? out_width - 1 - min_ix : max_ix;
-      for (int x = x0; x <= x1; ++x) {
-        const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix;
-        const int64 xs_lower = xs[ix].lower;
-        const int64 xs_upper = xs[ix].upper;
-        const float xs_lerp = xs[ix].lerp;
-
-        // Read channel 0.
-        const float top_left0(ys_input_lower_ptr[xs_lower]);
-        const float top_right0(ys_input_lower_ptr[xs_upper]);
-        const float bottom_left0(ys_input_upper_ptr[xs_lower]);
-        const float bottom_right0(ys_input_upper_ptr[xs_upper]);
-
-        // Compute output.
-        float result0 = compute_lerp(top_left0, top_right0, bottom_left0,
-                                     bottom_right0, xs_lerp, ys_lerp);
-        output_y_ptr[x] =
-            cast_to<U>(result0, min_val, max_val, u_min_val, u_max_val);
-      }
-      output_y_ptr =
-          flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size;
-    }
-  } else if (channels == 2) {
-    for (int y = min_iy; y <= max_iy; ++y) {
-      const int iy = y - min_iy;
-      const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size;
-      const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size;
-      const float ys_lerp = ys[iy].lerp;
-      const int x0 = flip_x ? out_width - 1 - max_ix : min_ix;
-      const int x1 = flip_x ? out_width - 1 - min_ix : max_ix;
-      for (int x = x0; x <= x1; ++x) {
-        const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix;
-        const int64 xs_lower = xs[ix].lower;
-        const int64 xs_upper = xs[ix].upper;
-        const float xs_lerp = xs[ix].lerp;
-
-        // Read channel 0.
-        const float top_left0(ys_input_lower_ptr[xs_lower + 0]);
-        const float top_right0(ys_input_lower_ptr[xs_upper + 0]);
-        const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]);
-        const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]);
-
-        // Read channel 1.
-        const float top_left1(ys_input_lower_ptr[xs_lower + 1]);
-        const float top_right1(ys_input_lower_ptr[xs_upper + 1]);
-        const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]);
-        const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]);
-
-        // Compute output.
-        float result0 = compute_lerp(top_left0, top_right0, bottom_left0,
-                                     bottom_right0, xs_lerp, ys_lerp);
-        float result1 = compute_lerp(top_left1, top_right1, bottom_left1,
-                                     bottom_right1, xs_lerp, ys_lerp);
-        output_y_ptr[x * 2 + 0] =
-            cast_to<U>(result0, min_val, max_val, u_min_val, u_max_val);
-        output_y_ptr[x * 2 + 1] =
-            cast_to<U>(result1, min_val, max_val, u_min_val, u_max_val);
-      }
-      output_y_ptr =
-          flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size;
-    }
-  } else if (channels == 3) {
-    for (int y = min_iy; y <= max_iy; ++y) {
-      const int iy = y - min_iy;
-      const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size;
-      const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size;
-      const float ys_lerp = ys[iy].lerp;
-      const int x0 = flip_x ? out_width - 1 - max_ix : min_ix;
-      const int x1 = flip_x ? out_width - 1 - min_ix : max_ix;
-      for (int x = x0; x <= x1; ++x) {
-        const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix;
-        const int64 xs_lower = xs[ix].lower;
-        const int64 xs_upper = xs[ix].upper;
-        const float xs_lerp = xs[ix].lerp;
-
-        // Read channel 0.
-        const float top_left0(ys_input_lower_ptr[xs_lower + 0]);
-        const float top_right0(ys_input_lower_ptr[xs_upper + 0]);
-        const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]);
-        const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]);
-
-        // Read channel 1.
-        const float top_left1(ys_input_lower_ptr[xs_lower + 1]);
-        const float top_right1(ys_input_lower_ptr[xs_upper + 1]);
-        const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]);
-        const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]);
-
-        // Read channel 2.
-        const float top_left2(ys_input_lower_ptr[xs_lower + 2]);
-        const float top_right2(ys_input_lower_ptr[xs_upper + 2]);
-        const float bottom_left2(ys_input_upper_ptr[xs_lower + 2]);
-        const float bottom_right2(ys_input_upper_ptr[xs_upper + 2]);
-
-        // Compute output.
-        float result0 = compute_lerp(top_left0, top_right0, bottom_left0,
-                                     bottom_right0, xs_lerp, ys_lerp);
-        float result1 = compute_lerp(top_left1, top_right1, bottom_left1,
-                                     bottom_right1, xs_lerp, ys_lerp);
-        float result2 = compute_lerp(top_left2, top_right2, bottom_left2,
-                                     bottom_right2, xs_lerp, ys_lerp);
-        output_y_ptr[x * 3 + 0] =
-            cast_to<U>(result0, min_val, max_val, u_min_val, u_max_val);
-        output_y_ptr[x * 3 + 1] =
-            cast_to<U>(result1, min_val, max_val, u_min_val, u_max_val);
-        output_y_ptr[x * 3 + 2] =
-            cast_to<U>(result2, min_val, max_val, u_min_val, u_max_val);
-      }
-      output_y_ptr =
-          flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size;
-    }
-  } else if (channels == 4) {
-    for (int y = min_iy; y <= max_iy; ++y) {
-      const int iy = y - min_iy;
-      const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size;
-      const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size;
-      const float ys_lerp = ys[iy].lerp;
-      const int x0 = flip_x ? out_width - 1 - max_ix : min_ix;
-      const int x1 = flip_x ? out_width - 1 - min_ix : max_ix;
-      for (int x = x0; x <= x1; ++x) {
-        const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix;
-        const int64 xs_lower = xs[ix].lower;
-        const int64 xs_upper = xs[ix].upper;
-        const float xs_lerp = xs[ix].lerp;
-
-        // Read channel 0.
-        const float top_left0(ys_input_lower_ptr[xs_lower + 0]);
-        const float top_right0(ys_input_lower_ptr[xs_upper + 0]);
-        const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]);
-        const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]);
-
-        // Read channel 1.
-        const float top_left1(ys_input_lower_ptr[xs_lower + 1]);
-        const float top_right1(ys_input_lower_ptr[xs_upper + 1]);
-        const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]);
-        const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]);
-
-        // Read channel 2.
-        const float top_left2(ys_input_lower_ptr[xs_lower + 2]);
-        const float top_right2(ys_input_lower_ptr[xs_upper + 2]);
-        const float bottom_left2(ys_input_upper_ptr[xs_lower + 2]);
-        const float bottom_right2(ys_input_upper_ptr[xs_upper + 2]);
-
-        // Read channel 3.
-        const float top_left3(ys_input_lower_ptr[xs_lower + 3]);
-        const float top_right3(ys_input_lower_ptr[xs_upper + 3]);
-        const float bottom_left3(ys_input_upper_ptr[xs_lower + 3]);
-        const float bottom_right3(ys_input_upper_ptr[xs_upper + 3]);
-
-        // Compute output.
-        float result0 = compute_lerp(top_left0, top_right0, bottom_left0,
-                                     bottom_right0, xs_lerp, ys_lerp);
-        float result1 = compute_lerp(top_left1, top_right1, bottom_left1,
-                                     bottom_right1, xs_lerp, ys_lerp);
-        float result2 = compute_lerp(top_left2, top_right2, bottom_left2,
-                                     bottom_right2, xs_lerp, ys_lerp);
-        float result3 = compute_lerp(top_left3, top_right3, bottom_left3,
-                                     bottom_right3, xs_lerp, ys_lerp);
-        output_y_ptr[x * 4 + 0] =
-            cast_to<U>(result0, min_val, max_val, u_min_val, u_max_val);
-        output_y_ptr[x * 4 + 1] =
-            cast_to<U>(result1, min_val, max_val, u_min_val, u_max_val);
-        output_y_ptr[x * 4 + 2] =
-            cast_to<U>(result2, min_val, max_val, u_min_val, u_max_val);
-        output_y_ptr[x * 4 + 3] =
-            cast_to<U>(result3, min_val, max_val, u_min_val, u_max_val);
-      }
-      output_y_ptr =
-          flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size;
-    }
-  } else {
-    for (int y = min_iy; y <= max_iy; ++y) {
-      const int iy = y - min_iy;
-      const T* ys_input_lower_ptr = image + ys[iy].lower * in_row_size;
-      const T* ys_input_upper_ptr = image + ys[iy].upper * in_row_size;
-      const float ys_lerp = ys[iy].lerp;
-      const int x0 = flip_x ? out_width - 1 - max_ix : min_ix;
-      const int x1 = flip_x ? out_width - 1 - min_ix : max_ix;
-      for (int x = x0; x <= x1; ++x) {
-        const int ix = flip_x ? out_width - 1 - min_ix - x : x - min_ix;
-        const int64 xs_lower = xs[ix].lower;
-        const int64 xs_upper = xs[ix].upper;
-        const float xs_lerp = xs[ix].lerp;
-        for (int ichan = 0; ichan < channels; ++ichan) {
-          const float top_left0(ys_input_lower_ptr[xs_lower + ichan]);
-          const float top_right0(ys_input_lower_ptr[xs_upper + ichan]);
-          const float bottom_left0(ys_input_upper_ptr[xs_lower + ichan]);
-          const float bottom_right0(ys_input_upper_ptr[xs_upper + ichan]);
-          float result0 = compute_lerp(top_left0, top_right0, bottom_left0,
-                                       bottom_right0, xs_lerp, ys_lerp);
-          output_y_ptr[x * channels + ichan] =
-              cast_to<U>(result0, min_val, max_val, u_min_val, u_max_val);
-        }
-      }
-      output_y_ptr =
-          flip_y ? output_y_ptr - out_row_size : output_y_ptr + out_row_size;
-    }
-  }
-}
-}  // namespace internal
-}  // namespace tensorflow
-#endif  // TENSORFLOW_CORE_KERNELS_CROP_RESIZE_BILINEAR_CORE_H_
diff --git a/tensorflow/core/kernels/cwise_op_tan.cc b/tensorflow/core/kernels/cwise_op_tan.cc
index c1a25767d3146abc43442cc25b48378c74f8e984..90762fb1b0c349a538a1d56f485b46a26fc37360 100644
--- a/tensorflow/core/kernels/cwise_op_tan.cc
+++ b/tensorflow/core/kernels/cwise_op_tan.cc
@@ -16,7 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER2(UnaryOp, CPU, "Tan", functor::tan, float, double);
+REGISTER4(UnaryOp, CPU, "Tan", functor::tan, float, double, complex64,
+          complex128);
 
 #if GOOGLE_CUDA
 REGISTER2(UnaryOp, GPU, "Tan", functor::tan, float, double);
diff --git a/tensorflow/core/kernels/data/stats_dataset_ops.cc b/tensorflow/core/kernels/data/stats_dataset_ops.cc
index 754c32b6ca003fcb6aca011037a1db27cfb1c367..58ec3d449518468ad0f7d2f67171032658d026e0 100644
--- a/tensorflow/core/kernels/data/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/stats_dataset_ops.cc
@@ -390,7 +390,7 @@ class FeatureStatsDatasetOp : public UnaryDatasetOpKernel {
 
         for (const auto& feature_list :
              example.feature_lists().feature_list()) {
-          stats_aggregator->IncrementCounter("feature_lists_count", "reainer",
+          stats_aggregator->IncrementCounter("feature_lists_count", "trainer",
                                              1);
           for (const auto& feature : feature_list.second.feature()) {
             feature_values_list_size_sum += AddStatsFeatureValues(feature);
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index 519c47533293aa7d36c7b9d2f54e1349263dc1b1..cb285bf7325b61f0507f771c23836fd9bc26eb41 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -536,6 +536,7 @@ class FakeParamOp : public OpKernel {
 };
 
 REGISTER_KERNEL_BUILDER(Name("FakeParam").Device(DEVICE_CPU), FakeParamOp);
+REGISTER_KERNEL_BUILDER(Name("FakeParam").Device(DEVICE_GPU), FakeParamOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index f99dd643f76e641490acdee8e68a2b0198730b69..d89f1592bd72d0f349b6f8a7eca64fc4d046050a 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -45,6 +45,24 @@ struct FusedBatchNorm;
 template <typename Device, typename T, typename U>
 struct FusedBatchNormGrad;
 
+template <bool IsSame, typename Y, typename X, typename T>
+struct CastIfNecessary {
+  static inline void process(
+      Y& y, X& x_shifted, const Eigen::DSizes<Eigen::Index, 2>& rest_by_depth,
+      const CPUDevice& d) {
+    y.reshape(rest_by_depth).device(d) = x_shifted.template cast<T>();
+  }
+};
+
+template <typename Y, typename X, typename T>
+struct CastIfNecessary<true, Y, X, T> {
+  static inline void process(
+      Y& y, X& x_shifted, const Eigen::DSizes<Eigen::Index, 2>& rest_by_depth,
+      const CPUDevice& d) {
+    y.reshape(rest_by_depth).device(d) = x_shifted;
+  }
+};
+
 template <typename T, typename U>
 struct FusedBatchNorm<CPUDevice, T, U> {
   void operator()(OpKernelContext* context, const Tensor& x_input,
@@ -125,7 +143,11 @@ struct FusedBatchNorm<CPUDevice, T, U> {
     auto x_shifted =
         x_scaled + offset.reshape(one_by_depth).broadcast(bcast_spec);
 
-    y.reshape(rest_by_depth).device(d) = x_shifted.template cast<T>();
+    // Explicitly checks the types of T and U and only casts x_shifted when
+    // T != U. (Not doing so caused a 35-50% performance slowdown for
+    // some compiler flags.)
+    CastIfNecessary<std::is_same<T, U>::value, decltype(y), decltype(x_shifted),
+                    T>::process(y, x_shifted, rest_by_depth, d);
   }
 };
 
diff --git a/tensorflow/core/kernels/gather_nd_op.cc b/tensorflow/core/kernels/gather_nd_op.cc
index 4e53291b7fe715fb72e81fa275608cd0e501723a..e50b7fe3bf7fb7a32820ec6f95421cb90b506c0a 100644
--- a/tensorflow/core/kernels/gather_nd_op.cc
+++ b/tensorflow/core/kernels/gather_nd_op.cc
@@ -188,12 +188,13 @@ Status DoGatherNd(OpKernelContext* c, const Tensor& params,
 
     // bad_i will only return >= 0 on CPUs right now.
     if (bad_i >= 0) {
+      auto shape = indices.shape();
+      shape.RemoveLastDims(1);
       return errors::InvalidArgument(
-          "flat indices[", bad_i, ", :] = [",
+          "indices", SliceDebugString(shape, bad_i), " = [",
           str_util::Join(
               gtl::ArraySlice<Index>(&indices_mat(bad_i, 0), indices_nd), ", "),
-          "] does not index into param (shape: ", params.shape().DebugString(),
-          ").");
+          "] does not index into param shape ", params.shape().DebugString());
     }
   }
   return Status::OK();
diff --git a/tensorflow/core/kernels/lookup_table_op.cc b/tensorflow/core/kernels/lookup_table_op.cc
index 57b7798ba04eab5d1a869d4782dfe7d0dc727df4..07e754a6efc01b6be46c6e0a692481454104d80f 100644
--- a/tensorflow/core/kernels/lookup_table_op.cc
+++ b/tensorflow/core/kernels/lookup_table_op.cc
@@ -822,6 +822,7 @@ REGISTER_KERNEL(int64, float);
 REGISTER_KERNEL(string, string);
 REGISTER_KERNEL(string, bool);
 REGISTER_KERNEL(int32, int32);
+REGISTER_KERNEL(int32, string);
 
 #undef REGISTER_KERNEL
 
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index b596dbc7824730b47f32164ac49548ce3ac709ad..80376c61aa9f255b5e730d26c09c466a2e3eae9b 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -453,10 +453,14 @@ class MatMulOp : public OpKernel {
     const Tensor& b = ctx->input(1);
 
     // Check that the dimensions of the two matrices are valid.
-    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()),
-                errors::InvalidArgument("In[0] is not a matrix"));
-    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()),
-                errors::InvalidArgument("In[1] is not a matrix"));
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsMatrix(a.shape()),
+        errors::InvalidArgument("In[0] is not a matrix. Instead it has shape ",
+                                a.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsMatrix(b.shape()),
+        errors::InvalidArgument("In[1] is not a matrix. Instead it has shape ",
+                                b.shape().DebugString()));
     Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair;
     dim_pair[0].first = transpose_a_ ? 0 : 1;
     dim_pair[0].second = transpose_b_ ? 1 : 0;
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index 6f490cdc23d6a066e03aeb6a54442fcb74896ebd..d8efb1be3eddb63fb57665f4600a7487bc4321ec 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -308,11 +308,9 @@ class MklConcatOp : public OpKernel {
     }
 
     if (invoke_eigen) {
-      string msg = std::string("Invoking Eigen version of Concat. Reason:") +
-                   (!is_concat_dim_channel
-                        ? std::string("Concat dimension is not channel")
-                        : std::string("Not all tensors are in Mkl layout"));
-      VLOG(1) << "_MklConcatOp: " << msg;
+      VLOG(1) << "_MklConcatOp: Invoking Eigen version of Concat. Reason:"
+              << (!is_concat_dim_channel ? "Concat dimension is not channel"
+                                         : "Not all tensors are in Mkl layout");
       CallEigenVersion(context, input_tensors, input_shapes);
       return;
     }
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index 4e80f5acce65db12e6193f554d79083a8686aec0..b73a119a8896f8b11f66993b168d3235c95c376b 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -328,9 +328,8 @@ class MklConv2DBwdFilterPrimitiveFactory : public MklPrimitiveFactory<T> {
     return instance_;
   }
 
-  static std::string CreateKey(
-      const MklConvBwdFilterParams& convBwdFilterDims) {
-    std::string prefix = "conv2d_bwd_filter";
+  static string CreateKey(const MklConvBwdFilterParams& convBwdFilterDims) {
+    string prefix = "conv2d_bwd_filter";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
     key_creator.AddAsKey(convBwdFilterDims.src_dims);
@@ -346,13 +345,13 @@ class MklConv2DBwdFilterPrimitiveFactory : public MklPrimitiveFactory<T> {
 
   MklPrimitive* GetConv2dBwdFilter(
       const MklConvBwdFilterParams& convBwdFilterDims) {
-    std::string key = CreateKey(convBwdFilterDims);
+    string key = CreateKey(convBwdFilterDims);
     return this->GetOp(key);
   }
 
   void SetConv2dBwdFilter(
       const MklConvBwdFilterParams& convBwdFilterDims, MklPrimitive* op) {
-    std::string key = CreateKey(convBwdFilterDims);
+    string key = CreateKey(convBwdFilterDims);
     this->SetOp(key, op);
   }
 };
@@ -872,12 +871,10 @@ class MklConv2DCustomBackpropFilterOp
       }
 
       // check if src and diff_dst need reorder
-      std::vector<primitive> net;
       T *src_data = nullptr;
       if (fwd_src_md.data.format != conv2d_bwd_filter->GetSrcMemoryFormat()) {
         src.SetUsrMem(fwd_src_md, &src_tensor);
-        src.CheckReorderToOpMem(
-            bwd_filter_pd->src_primitive_desc(), &net);
+        src.CheckReorderToOpMem(bwd_filter_pd->src_primitive_desc());
         src_data = static_cast<T*>(src.GetOpMem().get_data_handle());
       } else {
         src_data = static_cast<T*>(const_cast<T*>(
@@ -888,15 +885,13 @@ class MklConv2DCustomBackpropFilterOp
       if (diff_dst_md.data.format !=
           conv2d_bwd_filter->GetDiffDstMemoryFormat()) {
         diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
-        diff_dst.CheckReorderToOpMem(
-            bwd_filter_pd->diff_dst_primitive_desc(), &net);
+        diff_dst.CheckReorderToOpMem(bwd_filter_pd->diff_dst_primitive_desc());
         diff_dst_data = static_cast<T*>(
             diff_dst.GetOpMem().get_data_handle());
       } else {
         diff_dst_data = static_cast<T*>(const_cast<T*>(
             diff_dst_tensor.flat<T>().data()));
       }
-      stream(stream::kind::eager).submit(net).wait();
 
       // For backward filter, convert diff_filter back to Tensorflow layout
       // Here we prepare to reorder op memory back to user memory
@@ -929,9 +924,7 @@ class MklConv2DCustomBackpropFilterOp
 
       // Reorder diff_filter back to Tensorflow layout if necessary
       if (diff_filter_reorder_required) {
-        std::vector<primitive> net;
-        diff_filter.InsertReorderToUserMem(&net);
-        stream(stream::kind::eager).submit(net).wait();
+        diff_filter.InsertReorderToUserMem();
       }
     } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index 0af4568b47b5d88651d1841ada9826b73ca3f26f..39498f1a80c8291f8fe18d5ac02a0f32f86b2930 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -265,9 +265,8 @@ class MklConv2DBwdInputPrimitiveFactory : public MklPrimitiveFactory<T> {
     return instance_;
   }
 
-  static std::string CreateKey(
-      const MklConvBwdInputParams& convBwdInputDims) {
-    std::string prefix = "conv2d_bwd_input";
+  static string CreateKey(const MklConvBwdInputParams& convBwdInputDims) {
+    string prefix = "conv2d_bwd_input";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
     key_creator.AddAsKey(convBwdInputDims.diff_src_dims);
@@ -282,13 +281,13 @@ class MklConv2DBwdInputPrimitiveFactory : public MklPrimitiveFactory<T> {
 
   MklPrimitive* GetConv2dBwdInput(
       const MklConvBwdInputParams& convBwdInputDims) {
-    std::string key = CreateKey(convBwdInputDims);
+    string key = CreateKey(convBwdInputDims);
     return this->GetOp(key);
   }
 
   void SetConv2dBwdInput(
       const MklConvBwdInputParams& convBwdInputDims, MklPrimitive *op) {
-    std::string key = CreateKey(convBwdInputDims);
+    string key = CreateKey(convBwdInputDims);
     this->SetOp(key, op);
   }
 };
@@ -722,14 +721,11 @@ class MklConv2DCustomBackpropInputOp
           diff_src_tensor->flat<T>().data()));
 
       // check if filter and diff_dst need reorder
-      std::vector<primitive> net;
       T* filter_data = nullptr;
       if (fwd_filter_md.data.format !=
           conv2d_bwd_input->GetFilterMemoryFormat()) {
         filter.SetUsrMem(fwd_filter_md, &filter_tensor);
-        filter.CheckReorderToOpMem(
-           bwd_input_pd->weights_primitive_desc(),
-           &net);
+        filter.CheckReorderToOpMem(bwd_input_pd->weights_primitive_desc());
         filter_data = static_cast<T*>(filter.GetOpMem().get_data_handle());
       } else {
         filter_data = static_cast<T*>(const_cast<T*>(
@@ -740,15 +736,13 @@ class MklConv2DCustomBackpropInputOp
       if (diff_dst_md.data.format !=
           conv2d_bwd_input->GetDiffDstMemoryFormat()) {
         diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor);
-        diff_dst.CheckReorderToOpMem(
-           bwd_input_pd->diff_dst_primitive_desc(), &net);
+        diff_dst.CheckReorderToOpMem(bwd_input_pd->diff_dst_primitive_desc());
         diff_dst_data = static_cast<T*>(
                          diff_dst.GetOpMem().get_data_handle());
       } else {
         diff_dst_data = static_cast<T*>(const_cast<T*>(
                          diff_dst_tensor.flat<T>().data()));
       }
-      stream(stream::kind::eager).submit(net).wait();
 
       // execute convolution input bwd
       conv2d_bwd_input->Execute(diff_src_data, filter_data, diff_dst_data);
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index b5689732205f92f0c6ba1520eb23958f5268636b..62396eeb8b1a55fd13c29d59281e53ad40755192 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -18,7 +18,6 @@ limitations under the License.
 
 #include <string.h>
 #include <map>
-#include <string>
 #include <vector>
 #include <memory>
 
@@ -35,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/padding.h"
@@ -298,8 +298,8 @@ class MklConv2DFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
     return instance_;
   }
 
-  static std::string CreateKey(const MklConvFwdParams& convFwdDims) {
-    std::string prefix = "conv2d_fwd_";
+  static string CreateKey(const MklConvFwdParams& convFwdDims) {
+    string prefix = "conv2d_fwd_";
     FactoryKeyCreator key_creator;
     key_creator.AddAsKey(prefix);
     key_creator.AddAsKey(convFwdDims.src_dims);
@@ -314,12 +314,12 @@ class MklConv2DFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
   }
 
   MklPrimitive* GetConv2DFwd(const MklConvFwdParams& convFwdDims) {
-    std::string key = CreateKey(convFwdDims);
+    string key = CreateKey(convFwdDims);
     return this->GetOp(key);
   }
 
   void SetConv2DFwd(const MklConvFwdParams& convFwdDims, MklPrimitive* op) {
-    std::string key = CreateKey(convFwdDims);
+    string key = CreateKey(convFwdDims);
     this->SetOp(key, op);
   }
 };
@@ -930,10 +930,9 @@ class MklConv2DOp : public OpKernel {
         conv2d_fwd->Execute(src_data, filter_data, dst_data);
       }
     } catch (mkldnn::error &e) {
-      string error_msg = "Status: " + std::to_string(e.status) +
-                       ", message: " + std::string(e.message) +
-                       ", in file " + std::string(__FILE__) + ":" +
-                       std::to_string(__LINE__);
+      string error_msg = tensorflow::strings::StrCat(
+          "Status: ", e.status, ", message: ", string(e.message), ", in file ",
+          __FILE__, ":", __LINE__);
       OP_REQUIRES_OK(context,
         errors::Aborted("Operation received an exception:", error_msg));
     }
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index 5e1a5001dc34c1d79693b4aa60ca1610f5c8ea15..3f154ff33bc5c42600bf9570d0abe95acbdf8b6e 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
 
 #include <limits>
-#include <string>
 #include <vector>
 #include <memory>
 
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index c0dfed7d7d079c2b837afdb440c01687e6b6d4db..cb1eecb36aa2016af4e3b9a57e7a164624555ac4 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_MKL_POOLING_OPS_COMMON_H_
 
 #ifdef INTEL_MKL
-#include <string>
 #include <vector>
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index b5c6ba1da3a1fbb11193953716b0095699541856..a7a9609c21724d278d7d091f01403f7c103138dc 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -330,6 +330,7 @@ class PartitionedCallOp : public AsyncOpKernel {
     // using device-specific threadpools when available.
     opts.runner = ctx->runner();
     opts.source_device = local_device_name_;
+    opts.allow_dead_tensors = true;
     // TODO(akshayka): Accommodate the multiple-worker scenario by adding the
     // constructed rendezvous to a rendezvous manager.
     Rendezvous* rendez = new IntraProcessRendezvous(lib->device_mgr());
diff --git a/tensorflow/core/kernels/resize_bilinear_op.cc b/tensorflow/core/kernels/resize_bilinear_op.cc
index bf9e93ef3f450f5cf686aa5d9d940b73e035fecb..dde59e8e741aca2c715aeb9d548979200af8789b 100644
--- a/tensorflow/core/kernels/resize_bilinear_op.cc
+++ b/tensorflow/core/kernels/resize_bilinear_op.cc
@@ -25,15 +25,10 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.h"
-#include "tensorflow/core/kernels/crop_resize_bilinear_core.h"
 #include "tensorflow/core/kernels/image_resizer_state.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
-using ::tensorflow::internal::CachedInterpolation;
-using ::tensorflow::internal::compute_interpolation_weights;
-using ::tensorflow::internal::crop_resize_single_image;
-
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -68,6 +63,140 @@ class ResizeBilinearOp : public OpKernel {
   bool align_corners_;
 };
 
+namespace {
+// Compute the interpolation indices only once.
+struct CachedInterpolation {
+  int64 lower;  // Lower source index used in the interpolation
+  int64 upper;  // Upper source index used in the interpolation
+  // 1-D linear iterpolation scale (see:
+  // https://en.wikipedia.org/wiki/Bilinear_interpolation)
+  float lerp;
+};
+
+inline void compute_interpolation_weights(const int64 out_size,
+                                          const int64 in_size,
+                                          const float scale,
+                                          CachedInterpolation* interpolation) {
+  interpolation[out_size].lower = 0;
+  interpolation[out_size].upper = 0;
+  for (int64 i = out_size - 1; i >= 0; --i) {
+    const float in = i * scale;
+    interpolation[i].lower = static_cast<int64>(in);
+    interpolation[i].upper = std::min(interpolation[i].lower + 1, in_size - 1);
+    interpolation[i].lerp = in - interpolation[i].lower;
+  }
+}
+
+/**
+ * Computes the bilinear interpolation from the appropriate 4 float points
+ * and the linear interpolation weights.
+ */
+inline float compute_lerp(const float top_left, const float top_right,
+                          const float bottom_left, const float bottom_right,
+                          const float x_lerp, const float y_lerp) {
+  const float top = top_left + (top_right - top_left) * x_lerp;
+  const float bottom = bottom_left + (bottom_right - bottom_left) * x_lerp;
+  return top + (bottom - top) * y_lerp;
+}
+
+template <typename T>
+void resize_image(
+    typename TTypes<T, 4>::ConstTensor images, const int batch_size,
+    const int64 in_height, const int64 in_width, const int64 out_height,
+    const int64 out_width, const int channels,
+    const std::vector<CachedInterpolation>& xs,
+    const std::vector<CachedInterpolation>& ys,
+    typename TTypes<float, 4>::Tensor output) TF_ATTRIBUTE_NOINLINE;
+template <typename T>
+void resize_image(typename TTypes<T, 4>::ConstTensor images,
+                  const int batch_size, const int64 in_height,
+                  const int64 in_width, const int64 out_height,
+                  const int64 out_width, const int channels,
+                  const std::vector<CachedInterpolation>& xs_vec,
+                  const std::vector<CachedInterpolation>& ys,
+                  typename TTypes<float, 4>::Tensor output) {
+  const int64 in_row_size = in_width * channels;
+  const int64 in_batch_num_values = in_height * in_row_size;
+  const int64 out_row_size = out_width * channels;
+
+  const T* input_b_ptr = images.data();
+  const CachedInterpolation* xs = xs_vec.data();
+
+  if (channels == 3) {
+    float* output_y_ptr = output.data();
+    for (int b = 0; b < batch_size; ++b) {
+      for (int64 y = 0; y < out_height; ++y) {
+        const T* ys_input_lower_ptr = input_b_ptr + ys[y].lower * in_row_size;
+        const T* ys_input_upper_ptr = input_b_ptr + ys[y].upper * in_row_size;
+        const float ys_lerp = ys[y].lerp;
+        for (int64 x = 0; x < out_width; ++x) {
+          const int64 xs_lower = xs[x].lower;
+          const int64 xs_upper = xs[x].upper;
+          const float xs_lerp = xs[x].lerp;
+
+          // Read channel 0.
+          const float top_left0(ys_input_lower_ptr[xs_lower + 0]);
+          const float top_right0(ys_input_lower_ptr[xs_upper + 0]);
+          const float bottom_left0(ys_input_upper_ptr[xs_lower + 0]);
+          const float bottom_right0(ys_input_upper_ptr[xs_upper + 0]);
+
+          // Read channel 1.
+          const float top_left1(ys_input_lower_ptr[xs_lower + 1]);
+          const float top_right1(ys_input_lower_ptr[xs_upper + 1]);
+          const float bottom_left1(ys_input_upper_ptr[xs_lower + 1]);
+          const float bottom_right1(ys_input_upper_ptr[xs_upper + 1]);
+
+          // Read channel 2.
+          const float top_left2(ys_input_lower_ptr[xs_lower + 2]);
+          const float top_right2(ys_input_lower_ptr[xs_upper + 2]);
+          const float bottom_left2(ys_input_upper_ptr[xs_lower + 2]);
+          const float bottom_right2(ys_input_upper_ptr[xs_upper + 2]);
+
+          // Compute output.
+          output_y_ptr[x * channels + 0] =
+              compute_lerp(top_left0, top_right0, bottom_left0, bottom_right0,
+                           xs_lerp, ys_lerp);
+          output_y_ptr[x * channels + 1] =
+              compute_lerp(top_left1, top_right1, bottom_left1, bottom_right1,
+                           xs_lerp, ys_lerp);
+          output_y_ptr[x * channels + 2] =
+              compute_lerp(top_left2, top_right2, bottom_left2, bottom_right2,
+                           xs_lerp, ys_lerp);
+        }
+        output_y_ptr += out_row_size;
+      }
+      input_b_ptr += in_batch_num_values;
+    }
+  } else {
+    float* output_y_ptr = output.data();
+    for (int b = 0; b < batch_size; ++b) {
+      for (int64 y = 0; y < out_height; ++y) {
+        const T* ys_input_lower_ptr = input_b_ptr + ys[y].lower * in_row_size;
+        const T* ys_input_upper_ptr = input_b_ptr + ys[y].upper * in_row_size;
+        const float ys_lerp = ys[y].lerp;
+        for (int64 x = 0; x < out_width; ++x) {
+          auto xs_lower = xs[x].lower;
+          auto xs_upper = xs[x].upper;
+          auto xs_lerp = xs[x].lerp;
+          for (int c = 0; c < channels; ++c) {
+            const float top_left(ys_input_lower_ptr[xs_lower + c]);
+            const float top_right(ys_input_lower_ptr[xs_upper + c]);
+            const float bottom_left(ys_input_upper_ptr[xs_lower + c]);
+            const float bottom_right(ys_input_upper_ptr[xs_upper + c]);
+            output_y_ptr[x * channels + c] =
+                compute_lerp(top_left, top_right, bottom_left, bottom_right,
+                             xs_lerp, ys_lerp);
+          }
+        }
+        output_y_ptr += out_row_size;
+      }
+      input_b_ptr += in_batch_num_values;
+    }
+  }
+}
+
+}  // namespace
+
 // Partial specialization of ResizeBilinear functor for a CPUDevice.
 namespace functor {
 template <typename T>
@@ -83,11 +212,6 @@ struct ResizeBilinear<CPUDevice, T> {
     const int64 out_height = output.dimension(1);
     const int64 out_width = output.dimension(2);
 
-    const int64 in_row_size = in_width * channels;
-    const int64 in_batch_num_values = in_height * in_row_size;
-    const int64 out_row_size = out_width * channels;
-    const int64 out_batch_num_values = out_row_size * out_height;
-
     // Handle no-op resizes efficiently.
     if (out_height == in_height && out_width == in_width) {
       output = images.template cast<float>();
@@ -108,13 +232,8 @@ struct ResizeBilinear<CPUDevice, T> {
       xs[i].upper *= channels;
     }
 
-    for (int b = 0; b < batch_size; ++b) {
-      crop_resize_single_image(
-          images.data() + static_cast<int64>(b) * in_batch_num_values,
-          in_height, in_width, out_height, out_width, channels, 0,
-          out_width - 1, xs.data(), 0, out_height - 1, ys.data(), 0.0f, false,
-          false, output.data() + static_cast<int64>(b) * out_batch_num_values);
-    }
+    resize_image<T>(images, batch_size, in_height, in_width, out_height,
+                    out_width, channels, xs, ys, output);
   }
 };
 }  // namespace functor
diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc
index c5292e1ae17e4945b14bb2d71580278888fe3e5f..cab9eb729d21d8e7158364777a4f6703b78adba6 100644
--- a/tensorflow/core/kernels/resource_variable_ops.cc
+++ b/tensorflow/core/kernels/resource_variable_ops.cc
@@ -213,64 +213,32 @@ class AssignVariableOp : public OpKernel {
                     "Variable and value dtypes don't match; respectively, ",
                     dtype_, " and ", context->input(1).dtype()));
     Var* variable = nullptr;
-    OP_REQUIRES_OK(
-        context,
-        LookupOrCreateResource<Var>(
-            context, HandleFromInput(context, 0), &variable,
-            [this, context](Var** ptr) {
-              *ptr = new Var(dtype_);
-              PersistentTensor unused;
-              Tensor* tmp;
-              AllocatorAttributes attr;
-              if (!relax_constraints_) {
-                attr.set_gpu_compatible(true);
-                attr.set_nic_compatible(true);
-              }
-              TF_RETURN_IF_ERROR(context->allocate_persistent(
-                  dtype_, context->input(1).shape(), &unused, &tmp, attr));
-              *(*ptr)->tensor() = *tmp;
-              return Status::OK();
-            }));
+    const Tensor& value = context->input(1);
+    // Note: every resource-variable-manipulating op assumes copy-on-write
+    // semantics, and creates a copy of the variable's Tensor if its refcount is
+    // bigger than 1 when we try to modify it. This means we never need to copy
+    // the original tensor for AssignVariableOp; even if there are other live
+    // users of it we know none can modify it so this is always safe (even in
+    // esoteric cases where the same tensor is used to initialize multiple
+    // variables or the tensor is a constant this is safe, as future writes will
+    // trigger copies).
+    OP_REQUIRES_OK(context, LookupOrCreateResource<Var>(
+                                context, HandleFromInput(context, 0), &variable,
+                                [this, &value](Var** ptr) {
+                                  *ptr = new Var(dtype_);
+                                  *(*ptr)->tensor() = value;
+                                  (*ptr)->is_initialized = true;
+                                  return Status::OK();
+                                }));
     core::ScopedUnref s(variable);
-
     OP_REQUIRES(context, variable->tensor()->dtype() == dtype_,
                 errors::InvalidArgument(
                     "Trying to assign variable with wrong dtype. Expected ",
                     DataTypeString(variable->tensor()->dtype()), " got ",
                     DataTypeString(dtype_)));
-
-    const Tensor& value = context->input(1);
-    AllocatorAttributes attr;
-    if (!relax_constraints_) {
-      attr.set_gpu_compatible(true);
-      attr.set_nic_compatible(true);
-    }
-
-    // Copying is unnecessary if we are the last user of the value
-    // tensor, we can just adopt the input tensor's buffer instead.
-    std::unique_ptr<Tensor> input_alias = context->forward_input(
-        1, OpKernelContext::Params::kNoReservation /*output_index*/, dtype_,
-        value.shape(), DEVICE_MEMORY, attr);
     mutex_lock ml(*variable->mu());
     variable->is_initialized = true;
-    if (input_alias) {
-      *variable->tensor() = *input_alias;
-      return;
-    }
-
-    // Need to copy, but maybe we can re-use variable's buffer?
-    if (!variable->tensor()->RefCountIsOne() ||
-        !variable->tensor()->shape().IsSameSize(value.shape())) {
-      // Copy to new buffer
-      PersistentTensor unused;
-      Tensor* tmp;
-      OP_REQUIRES_OK(context, context->allocate_persistent(
-                                  dtype_, value.shape(), &unused, &tmp, attr));
-      *variable->tensor() = *tmp;
-    }
-    functor::DenseUpdate<Device, T, ASSIGN> copy_functor;
-    copy_functor(context->eigen_device<Device>(), variable->tensor()->flat<T>(),
-                 value.flat<T>());
+    *variable->tensor() = value;
   }
 
  private:
diff --git a/tensorflow/core/kernels/save_restore_tensor.cc b/tensorflow/core/kernels/save_restore_tensor.cc
index 5036d9704b680727e2afb9f04d53ef75f237857b..4a2313523da4a2605c1d0e52f3220e18a3466f4e 100644
--- a/tensorflow/core/kernels/save_restore_tensor.cc
+++ b/tensorflow/core/kernels/save_restore_tensor.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -227,6 +228,89 @@ void RestoreTensor(OpKernelContext* context,
 #undef READER_COPY
 }
 
+namespace {
+
+// Tensors larger than this threshold will be restored from a thread-pool.
+const int64 kLargeShapeThreshold = 16 << 20;  // 16M
+
+// A restore operation for a single tensor.  Small tensors may be restored
+// directly from the op thread to improve read locality.  Large tensors can be
+// restored from a thread pool: this requires creating a separate BundleReader
+// for each restore.
+struct RestoreOp {
+  RestoreOp& operator=(const RestoreOp&) = delete;
+
+  bool should_run_in_pool(BundleReader* reader) const {
+    TensorShape restored_full_shape;
+
+    // Ignore status here; we'll catch the error later.
+    if (!reader->LookupTensorShape(tensor_name, &restored_full_shape).ok()) {
+      return false;
+    }
+
+    return restored_full_shape.num_elements() > kLargeShapeThreshold;
+  }
+
+  // Run this restore operation using a new BundleReader.
+  void run_with_new_reader() {
+    BundleReader reader(Env::Default(), reader_prefix);
+    if (!reader.status().ok()) {
+      status = reader.status();
+      return;
+    }
+
+    status = run(&reader);
+  }
+
+  Status run(BundleReader* reader) {
+    TensorShape restored_full_shape;
+    TF_RETURN_IF_ERROR(
+        reader->LookupTensorShape(tensor_name, &restored_full_shape));
+
+    VLOG(1) << "Restoring tensor " << idx << " : " << tensor_name << " : "
+            << restored_full_shape.num_elements();
+    Tensor* restored_tensor;
+    if (shape_and_slice.empty()) {
+      // Lookup the full tensor.
+      TF_RETURN_IF_ERROR(
+          context->allocate_output(idx, restored_full_shape, &restored_tensor));
+      TF_RETURN_IF_ERROR(reader->Lookup(tensor_name, restored_tensor));
+    } else {
+      // Lookup the slice.
+      TensorShape parsed_full_shape;
+      TensorSlice parsed_slice;
+      TensorShape parsed_slice_shape;
+
+      TF_RETURN_IF_ERROR(
+          checkpoint::ParseShapeAndSlice(shape_and_slice, &parsed_full_shape,
+                                         &parsed_slice, &parsed_slice_shape));
+
+      if (!restored_full_shape.IsSameSize(parsed_full_shape)) {
+        return errors::InvalidArgument(
+            "tensor_name = ", tensor_name, "; shape in shape_and_slice spec ",
+            parsed_full_shape.DebugString(),
+            " does not match the shape stored in checkpoint: ",
+            restored_full_shape.DebugString());
+      }
+      TF_RETURN_IF_ERROR(
+          context->allocate_output(idx, parsed_slice_shape, &restored_tensor));
+      TF_RETURN_IF_ERROR(
+          reader->LookupSlice(tensor_name, parsed_slice, restored_tensor));
+    }
+    return Status::OK();
+  }
+
+  OpKernelContext* context;
+  size_t idx;
+  string tensor_name;
+  string shape_and_slice;
+  string reader_prefix;
+
+  ::tensorflow::Status status;
+};
+
+}  // namespace
+
 Status RestoreTensorsV2(OpKernelContext* context, const Tensor& prefix,
                         const Tensor& tensor_names,
                         const Tensor& shape_and_slices,
@@ -244,20 +328,20 @@ Status RestoreTensorsV2(OpKernelContext* context, const Tensor& prefix,
               return tensor_names_flat(a) < tensor_names_flat(b);
             });
 
-  BundleReader reader(Env::Default(), prefix_string);
-  TF_RETURN_IF_ERROR(reader.status());
+  std::vector<std::unique_ptr<RestoreOp> > pool_restore_ops;
+  std::vector<std::unique_ptr<RestoreOp> > direct_restore_ops;
+
+  BundleReader default_reader(Env::Default(), prefix_string);
+  TF_RETURN_IF_ERROR(default_reader.status());
 
-  // TODO(zongheng): potential optimization: one Seek() in first lookup.
-  // TODO(zongheng): consider measuring speed and issuing concurrent lookups
-  // within a fixed memory budget.
-  TensorShape restored_full_shape;
   std::vector<string> mismatched_errors;
-  for (size_t i : sorted_name_idx) {
+  for (const size_t i : sorted_name_idx) {
+    TensorShape restored_full_shape;
     DataType original_dtype;
     const string& tensor_name = tensor_names_flat(i);
     TF_RETURN_IF_ERROR(
-        reader.LookupDtypeAndShape(tensor_name,
-                                   &original_dtype, &restored_full_shape));
+        default_reader.LookupDtypeAndShape(
+            tensor_name, &original_dtype, &restored_full_shape));
     if (dtypes[i] != original_dtype) {
       string error_msg = strings::StrCat(
           "tensor_name = ", tensor_name,
@@ -271,48 +355,51 @@ Status RestoreTensorsV2(OpKernelContext* context, const Tensor& prefix,
     return errors::InvalidArgument(error_msg);
   }
 
-  Tensor* restored_tensor = nullptr;
   for (auto i : sorted_name_idx) {
     const string& tensor_name = tensor_names_flat(i);
     const string& shape_and_slice = shape_and_slices_flat(i);
-
-    TF_RETURN_IF_ERROR(
-        reader.LookupTensorShape(tensor_name, &restored_full_shape));
-
-    if (shape_and_slice.empty()) {
-      // Lookup the full tensor.
-      TF_RETURN_IF_ERROR(
-          context->allocate_output(i, restored_full_shape, &restored_tensor));
-      TF_RETURN_IF_ERROR(reader.Lookup(tensor_name, restored_tensor));
+    auto op =
+        new RestoreOp{context, i, tensor_name, shape_and_slice, prefix_string};
+    if (op->should_run_in_pool(&default_reader)) {
+      pool_restore_ops.emplace_back(op);
     } else {
-      // Lookup the slice.
-      TensorShape parsed_full_shape;
-      TensorSlice parsed_slice;
-      TensorShape parsed_slice_shape;
+      direct_restore_ops.emplace_back(op);
+    }
+  }
 
-      TF_RETURN_IF_ERROR(
-          checkpoint::ParseShapeAndSlice(shape_and_slice, &parsed_full_shape,
-                                         &parsed_slice, &parsed_slice_shape));
-      if (!restored_full_shape.IsSameSize(parsed_full_shape)) {
-        return errors::InvalidArgument(
-            "tensor_name = ", tensor_name, "; shape in shape_and_slice spec ",
-            parsed_full_shape.DebugString(),
-            " does not match the shape stored in checkpoint: ",
-            restored_full_shape.DebugString());
+  {
+    // Schedule any threaded operations first, skipping thread pool creation if
+    // we don't have any expensive operations.
+    std::unique_ptr<thread::ThreadPool> reader_pool;
+    if (!pool_restore_ops.empty()) {
+      reader_pool.reset(
+          new thread::ThreadPool(Env::Default(), "restore_tensors", 8));
+      for (auto& op : pool_restore_ops) {
+        reader_pool->Schedule([&op]() { op->run_with_new_reader(); });
       }
+    }
 
-      TF_RETURN_IF_ERROR(
-          context->allocate_output(i, parsed_slice_shape, &restored_tensor));
-      TF_RETURN_IF_ERROR(
-          reader.LookupSlice(tensor_name, parsed_slice, restored_tensor));
+    // Read small tensors from the op thread
+    for (auto& op : direct_restore_ops) {
+      TF_RETURN_IF_ERROR(op->run(&default_reader));
     }
-    if (dtypes[i] != restored_tensor->dtype()) {
+  }
+
+  // Check status of pool ops; this must come after the pool shuts down.
+  for (auto& op : pool_restore_ops) {
+    TF_RETURN_IF_ERROR(op->status);
+  }
+
+  for (auto i : sorted_name_idx) {
+    const string& tensor_name = tensor_names_flat(i);
+    if (dtypes[i] != context->mutable_output(i)->dtype()) {
       return errors::InvalidArgument(
           "tensor_name = ", tensor_name, "; expected dtype ",
           DataTypeString(dtypes[i]), " does not equal restored dtype ",
-          DataTypeString(restored_tensor->dtype()));
+          DataTypeString(context->mutable_output(i)->dtype()));
     }
   }
+
   return Status::OK();
 }
 
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index e1fc2ea128585fded6cd345d33a54720193146ca..e0194605ce0ae03b4ab57cb26f693fc64d027d6d 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -277,6 +277,9 @@ TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_ADD_SUB_CPU);
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_UPDATE_CPU);
 TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_CPU);
 TF_CALL_string(REGISTER_SCATTER_ND_CPU);
+TF_CALL_bool(REGISTER_SCATTER_ND_ADD_SUB_CPU);
+TF_CALL_bool(REGISTER_SCATTER_ND_UPDATE_CPU);
+TF_CALL_bool(REGISTER_SCATTER_ND_CPU);
 
 // Registers GPU kernels.
 #if GOOGLE_CUDA
@@ -309,6 +312,7 @@ TF_CALL_complex128(REGISTER_SCATTER_ND_ALL_GPU);
 
 TF_CALL_int32(REGISTER_SCATTER_ND_ADD_SUB_SYCL);
 TF_CALL_int32(REGISTER_SCATTER_ND_UPDATE_SYCL);
+TF_CALL_bool(REGISTER_SCATTER_ND_UPDATE_SYCL);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_ADD_SUB_SYCL);
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL);
 #undef REGISTER_SCATTER_ND_ADD_SUB_SYCL
@@ -537,11 +541,13 @@ Status DoScatterNd(OpKernelContext* c, const Tensor& indices,
     }
   }
   if (bad_i >= 0) {
+    auto slice_shape = indices.shape();
+    slice_shape.RemoveLastDims(1);
     return errors::InvalidArgument(
-        "Invalid indices: ", SliceDebugString(indices.shape(), bad_i), " = [",
+        "indices", SliceDebugString(slice_shape, bad_i), " = [",
         str_util::Join(
             gtl::ArraySlice<Index>(&indices_flat(bad_i, 0), slice_dim), ", "),
-        "] does not index into ", shape.DebugString());
+        "] does not index into shape ", shape.DebugString());
   }
   return Status::OK();
 }
diff --git a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
index 7cfffa20c5a491356d5172ec4346052c67328ff9..472f5a3547aaaf0237a6d3ce51a141519c4d11a4 100644
--- a/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
+++ b/tensorflow/core/kernels/scatter_nd_op_cpu_impl.h
@@ -161,15 +161,16 @@ struct ScatterNdFunctor<CPUDevice, T, Index, OP, IXDIM> {
 
 TF_CALL_ALL_TYPES(REGISTER_SCATTER_ND_UPDATE);
 REGISTER_SCATTER_ND_INDEX(string, scatter_nd_op::UpdateOp::ADD);
-TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_MATH)
-
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_MATH);
+TF_CALL_bool(REGISTER_SCATTER_ND_MATH);
 #undef REGISTER_SCATTER_ND_MATH
 #undef REGISTER_SCATTER_ND_UPDATE
 #undef REGISTER_SCATTER_ND_INDEX
 #undef REGISTER_SCATTER_ND_FULL
 
-#ifdef TENSORFLOW_USE_SYCL
 // Implementation of update functor for SYCL.
+#ifdef TENSORFLOW_USE_SYCL
+
 template <typename T, typename Index, scatter_nd_op::UpdateOp OP, int IXDIM>
 struct ScatterNdFunctor<SYCLDevice, T, Index, OP, IXDIM> {
   Index operator()(
diff --git a/tensorflow/core/kernels/scatter_nd_op_test.cc b/tensorflow/core/kernels/scatter_nd_op_test.cc
index c134a8dd5bcdb06445b063d4083c18e76c5f4265..95ecc69c95dd4aa566cf4a9c9a964a11353b066d 100644
--- a/tensorflow/core/kernels/scatter_nd_op_test.cc
+++ b/tensorflow/core/kernels/scatter_nd_op_test.cc
@@ -185,7 +185,7 @@ TEST_F(ScatterNdUpdateOpTest, Error_IndexOutOfRange) {
                            {100, 101, 102, 777, 778, 779, 10000, 10001, 10002});
   Status s = RunOpKernel();
   EXPECT_TRUE(str_util::StrContains(
-      s.ToString(), "Invalid indices: [2,0] = [99] does not index into [5,3]"))
+      s.ToString(), "indices[2] = [99] does not index into shape [5,3]"))
       << s;
 }
 
diff --git a/tensorflow/core/kernels/strided_slice_op.cc b/tensorflow/core/kernels/strided_slice_op.cc
index 1e3e92a68a05123bafad77348e6811a14c303301..c08ff47583409429e8091b8f058c551a036f8d21 100644
--- a/tensorflow/core/kernels/strided_slice_op.cc
+++ b/tensorflow/core/kernels/strided_slice_op.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/training_op_helpers.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -304,6 +305,8 @@ class StridedSliceAssignOp : public OpKernel {
       Var* v;
       OP_REQUIRES_OK(context,
                      LookupResource(context, HandleFromInput(context, 0), &v));
+      OP_REQUIRES_OK(context,
+                     PrepareToUpdateVariable<Device, T>(context, v->tensor()));
       old_lhs = *v->tensor();
       OP_REQUIRES(context, old_lhs.dtype() == DataTypeToEnum<T>::value,
                   errors::InvalidArgument(
diff --git a/tensorflow/core/kernels/training_op_helpers.cc b/tensorflow/core/kernels/training_op_helpers.cc
index f288e124ee5af0e62aed6b2f44f7933914649ce2..d3c4f620717f31df711731f7ad008133dc1faef5 100644
--- a/tensorflow/core/kernels/training_op_helpers.cc
+++ b/tensorflow/core/kernels/training_op_helpers.cc
@@ -39,8 +39,15 @@ mutex* GetTrainingVariableMutex(OpKernelContext* ctx, int input) {
 // GetInputTensor which will signal a failure.
 std::vector<mutex_lock> MaybeLockVariableInputMutexesInOrder(
     OpKernelContext* ctx, bool do_lock, const std::vector<int>& input_ids) {
+  bool any_resource = false;
+  for (auto i : input_ids) {
+    if (ctx->input_dtype(i) == DT_RESOURCE) {
+      any_resource = true;
+      break;
+    }
+  }
   std::vector<mutex_lock> locks;
-  if (!do_lock) {
+  if (!do_lock && !any_resource) {
     return locks;
   }
   std::vector<mutex*> mutexes;
diff --git a/tensorflow/core/kernels/training_op_helpers.h b/tensorflow/core/kernels/training_op_helpers.h
index 7e56e15450aba23e6625b27da34a29b1ad2ecce2..765335d3a071e948372032930f4ad363cfdf0c9b 100644
--- a/tensorflow/core/kernels/training_op_helpers.h
+++ b/tensorflow/core/kernels/training_op_helpers.h
@@ -80,18 +80,8 @@ Status GetInputTensorFromVariable(OpKernelContext* ctx, int input,
     Var* var;
     TF_RETURN_IF_ERROR(LookupResource(ctx, HandleFromInput(ctx, input), &var));
     core::ScopedUnref unref_var(var);
-    if (lock_held) {
-      TF_RETURN_IF_ERROR(
-          PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
-      *out = *var->tensor();
-    } else {
-      mutex_lock ml(*var->mu());
-      if (!sparse) {
-        TF_RETURN_IF_ERROR(
-            PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
-      }
-      *out = *var->tensor();
-    }
+    TF_RETURN_IF_ERROR(PrepareToUpdateVariable<Device, T>(ctx, var->tensor()));
+    *out = *var->tensor();
     return Status::OK();
   }
   *out = ctx->mutable_input(input, lock_held);
diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h
index 1c130ba30020521acd114f24df196f9df0be38ee..d6f3f26cd5b40dc4913ee0e7a99900eaca677a1a 100644
--- a/tensorflow/core/lib/bfloat16/bfloat16.h
+++ b/tensorflow/core/lib/bfloat16/bfloat16.h
@@ -45,17 +45,25 @@ typedef std::complex<double> complex128;
 struct bfloat16 {
   B16_DEVICE_FUNC bfloat16() {}
 
-  B16_DEVICE_FUNC explicit bfloat16(const float v) {
+  B16_DEVICE_FUNC static bfloat16 truncate_to_bfloat16(const float v) {
+    bfloat16 output;
     if (float_isnan(v)) {
-      value = NAN_VALUE;
-      return;
+      output.value = NAN_VALUE;
+      return output;
     }
     const uint16_t* p = reinterpret_cast<const uint16_t*>(&v);
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    value = p[0];
+    output.value = p[0];
 #else
-    value = p[1];
+    output.value = p[1];
 #endif
+    return output;
+  }
+
+  B16_DEVICE_FUNC explicit bfloat16(const float v) {
+    // TODO(asabne) : change the below line to
+    // value = round_to_bfloat16(v).value;
+    value = truncate_to_bfloat16(v).value;
   }
 
   B16_DEVICE_FUNC explicit bfloat16(const double val)
@@ -169,8 +177,6 @@ struct bfloat16 {
 
   // Converts a float point to bfloat16, with round-nearest-to-even as rounding
   // method.
-  // TODO(b/69266521): Add a truncate_to_bfloat16 function and make this
-  // function as default behavior.
   // TODO: There is a slightly faster implementation (8% faster on CPU)
   // than this (documented in cl/175987786), that is exponentially harder to
   // understand and document. Switch to the faster version when converting to
diff --git a/tensorflow/core/lib/core/errors.h b/tensorflow/core/lib/core/errors.h
index 51c09032dfb47fd699f142c98a091b1fab617782..a631d9815a824d411cbe41c77f58625bb7a33ba9 100644
--- a/tensorflow/core/lib/core/errors.h
+++ b/tensorflow/core/lib/core/errors.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <sstream>
 
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
@@ -118,6 +119,25 @@ DECLARE_ERROR(Unauthenticated, UNAUTHENTICATED)
 
 #undef DECLARE_ERROR
 
+// Produces a formatted string pattern from the name which can uniquely identify
+// this node upstream to produce an informative error message. The pattern
+// followed is: {{node <name>}}
+// Note: The pattern below determines the regex _NODEDEF_NAME_RE in the file
+// tensorflow/python/client/session.py
+// LINT.IfChange
+inline string FormatNodeNameForError(const string& name) {
+  return strings::StrCat("{{node ", name, "}}");
+}
+// LINT.ThenChange(//tensorflow/python/client/session.py)
+template <typename T>
+string FormatNodeNamesForError(const T& names) {
+  ::tensorflow::str_util::Formatter<string> f(
+      [](string* output, const string& s) {
+        ::tensorflow::strings::StrAppend(output, FormatNodeNameForError(s));
+      });
+  return ::tensorflow::str_util::Join(names, ", ", f);
+}
+
 // The CanonicalCode() for non-errors.
 using ::tensorflow::error::OK;
 
diff --git a/tensorflow/core/lib/core/refcount.h b/tensorflow/core/lib/core/refcount.h
index eb41f9ff3660bfe9bf332d74f6852b933ca23858..87bcfec41199daf18037652ec9140f8a05889e68 100644
--- a/tensorflow/core/lib/core/refcount.h
+++ b/tensorflow/core/lib/core/refcount.h
@@ -17,6 +17,8 @@ limitations under the License.
 #define TENSORFLOW_LIB_CORE_REFCOUNT_H_
 
 #include <atomic>
+#include <memory>
+
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
@@ -58,6 +60,15 @@ class RefCounted {
   void operator=(const RefCounted&) = delete;
 };
 
+// A deleter class to form a std::unique_ptr that unrefs objects.
+struct RefCountDeleter {
+  void operator()(tensorflow::core::RefCounted* o) const { o->Unref(); }
+};
+
+// A unique_ptr that unrefs the owned object on destruction.
+template <typename T>
+using RefCountPtr = std::unique_ptr<T, RefCountDeleter>;
+
 // Helper class to unref an object when out-of-scope.
 class ScopedUnref {
  public:
diff --git a/tensorflow/core/lib/db/sqlite.cc b/tensorflow/core/lib/db/sqlite.cc
index cb6943379d4ebe38c79ba9097d4c3183c7b8c205..cf11f3a331e6746374d6d7a8b8197cde514c8386 100644
--- a/tensorflow/core/lib/db/sqlite.cc
+++ b/tensorflow/core/lib/db/sqlite.cc
@@ -112,6 +112,7 @@ Status EnvPragma(Sqlite* db, const char* pragma, const char* var) {
 /* static */
 Status Sqlite::Open(const string& path, int flags, Sqlite** db) {
   flags |= SQLITE_OPEN_PRIVATECACHE;
+  flags |= SQLITE_OPEN_URI;
   sqlite3* sqlite = nullptr;
   int rc = sqlite3_open_v2(path.c_str(), &sqlite, flags, nullptr);
   if (rc != SQLITE_OK) {
diff --git a/tensorflow/core/lib/io/record_reader_writer_test.cc b/tensorflow/core/lib/io/record_reader_writer_test.cc
index 95ac040602d3c22e488792f1d83dd85449b980b7..c36c909399efcdfb58543c21f7ff6c2719eee556 100644
--- a/tensorflow/core/lib/io/record_reader_writer_test.cc
+++ b/tensorflow/core/lib/io/record_reader_writer_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/lib/io/record_reader.h"
 #include "tensorflow/core/lib/io/record_writer.h"
 
+#include <zlib.h>
 #include <vector>
 #include "tensorflow/core/platform/env.h"
 
@@ -33,6 +34,89 @@ static std::vector<int> BufferSizes() {
           12, 13, 14, 15, 16, 17, 18, 19, 20, 65536};
 }
 
+namespace {
+
+io::RecordReaderOptions GetMatchingReaderOptions(
+    const io::RecordWriterOptions& options) {
+  if (options.compression_type == io::RecordWriterOptions::ZLIB_COMPRESSION) {
+    return io::RecordReaderOptions::CreateRecordReaderOptions("ZLIB");
+  }
+  return io::RecordReaderOptions::CreateRecordReaderOptions("");
+}
+
+uint64 GetFileSize(const string& fname) {
+  Env* env = Env::Default();
+  uint64 fsize;
+  TF_CHECK_OK(env->GetFileSize(fname, &fsize));
+  return fsize;
+}
+
+void VerifyFlush(const io::RecordWriterOptions& options) {
+  std::vector<string> records = {
+      "abcdefghijklmnopqrstuvwxyz",
+      "ZYXWVUTSRQPONMLKJIHGFEDCBA0123456789!@#$%^&*()",
+      "G5SyohOL9UmXofSOOwWDrv9hoLLMYPJbG9r38t3uBRcHxHj2PdKcPDuZmKW62RIY",
+      "aaaaaaaaaaaaaaaaaaaaaaaaaa",
+  };
+
+  Env* env = Env::Default();
+  string fname = testing::TmpDir() + "/record_reader_writer_flush_test";
+
+  std::unique_ptr<WritableFile> file;
+  TF_CHECK_OK(env->NewWritableFile(fname, &file));
+  io::RecordWriter writer(file.get(), options);
+
+  std::unique_ptr<RandomAccessFile> read_file;
+  TF_CHECK_OK(env->NewRandomAccessFile(fname, &read_file));
+  io::RecordReaderOptions read_options = GetMatchingReaderOptions(options);
+  io::RecordReader reader(read_file.get(), read_options);
+
+  EXPECT_EQ(GetFileSize(fname), 0);
+  for (size_t i = 0; i < records.size(); i++) {
+    uint64 start_size = GetFileSize(fname);
+
+    // Write a new record.
+    TF_EXPECT_OK(writer.WriteRecord(records[i]));
+    TF_CHECK_OK(writer.Flush());
+    TF_CHECK_OK(file->Flush());
+
+    // Verify that file size has changed after file flush.
+    uint64 new_size = GetFileSize(fname);
+    EXPECT_GT(new_size, start_size);
+
+    // Verify that file has all records written so far and no more.
+    uint64 offset = 0;
+    string record;
+    for (size_t j = 0; j <= i; j++) {
+      // Check that j'th record is written correctly.
+      TF_CHECK_OK(reader.ReadRecord(&offset, &record));
+      EXPECT_EQ(record, records[j]);
+    }
+
+    // Verify that file has no more records.
+    CHECK_EQ(reader.ReadRecord(&offset, &record).code(), error::OUT_OF_RANGE);
+  }
+}
+
+}  // namespace
+
+TEST(RecordReaderWriterTest, TestFlush) {
+  io::RecordWriterOptions options;
+  VerifyFlush(options);
+}
+
+TEST(RecordReaderWriterTest, TestZlibSyncFlush) {
+  io::RecordWriterOptions options;
+  options.compression_type = io::RecordWriterOptions::ZLIB_COMPRESSION;
+  // The default flush_mode is Z_NO_FLUSH and only writes to the file when the
+  // buffer is full or the file is closed, which makes testing harder.
+  // By using Z_SYNC_FLUSH the test can verify Flush does write out records of
+  // approximately the right size at the right times.
+  options.zlib_options.flush_mode = Z_SYNC_FLUSH;
+
+  VerifyFlush(options);
+}
+
 TEST(RecordReaderWriterTest, TestBasics) {
   Env* env = Env::Default();
   string fname = testing::TmpDir() + "/record_reader_writer_test";
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index 02989f8d3d5be2020459620afc2bc58ab6abb22d..d6ae75473f7fae078282ed65f6098b8544d6ec77 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -2881,7 +2881,7 @@ REGISTER_OP("ScatterNdNonAliasingAdd")
     .Input("indices: Tindices")
     .Input("updates: T")
     .Output("output: T")
-    .Attr("T: numbertype")
+    .Attr("T: {numbertype, bool}")
     .Attr("Tindices: {int32, int64}")
     .SetShapeFn(shape_inference::ScatterNdUpdateShape);
 
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index e91089e627cf2f540d10531636f0f1175a0c2f05..4ac8e15160f12a251034fd4a533571a8603c9105 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -12073,6 +12073,32 @@ op {
     type: "type"
   }
 }
+op {
+  name: "Cast"
+  input_arg {
+    name: "x"
+    type_attr: "SrcT"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "DstT"
+  }
+  attr {
+    name: "SrcT"
+    type: "type"
+  }
+  attr {
+    name: "DstT"
+    type: "type"
+  }
+  attr {
+    name: "Truncate"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
 op {
   name: "Ceil"
   input_arg {
@@ -55242,6 +55268,61 @@ op {
     }
   }
 }
+op {
+  name: "ScatterNdNonAliasingAdd"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BOOL
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "ScatterNdSub"
   input_arg {
diff --git a/tensorflow/core/ops/lookup_ops.cc b/tensorflow/core/ops/lookup_ops.cc
index 444aa8b9544c62d81f288f21e4eaaac23d8691cb..2059741da9ebf3f774e207bad67c89c3e2eb358a 100644
--- a/tensorflow/core/ops/lookup_ops.cc
+++ b/tensorflow/core/ops/lookup_ops.cc
@@ -140,11 +140,13 @@ REGISTER_OP("LookupTableSize")
     .Input("table_handle: Ref(string)")
     .Output("size: int64")
     .SetShapeFn(TwoElementVectorInputsAndScalarOutputs);
+WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LookupTableSize");
 
 REGISTER_OP("LookupTableSizeV2")
     .Input("table_handle: resource")
     .Output("size: int64")
     .SetShapeFn(ScalarAndTwoElementVectorInputsAndScalarOutputs);
+WHITELIST_STATEFUL_OP_FOR_DATASET_FUNCTIONS("LookupTableSizeV2");
 
 REGISTER_OP("LookupTableExport")
     .Input("table_handle: Ref(string)")
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 4b0591c6e8a050aedeff616cb47854510212e547..1667c398f41d9ff9da60afeda64a0e595cda2c39 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -114,6 +114,7 @@ REGISTER_OP("Cast")
     .Output("y: DstT")
     .Attr("SrcT: type")
     .Attr("DstT: type")
+    .Attr("Truncate: bool = false")
     .SetShapeFn(shape_inference::UnchangedShape);
 
 REGISTER_OP("_HostCast")
@@ -121,6 +122,7 @@ REGISTER_OP("_HostCast")
     .Output("y: DstT")
     .Attr("SrcT: type")
     .Attr("DstT: type")
+    .Attr("Truncate: bool = false")
     .SetShapeFn(shape_inference::UnchangedShape)
     .Doc(R"doc(
 Cast x of type SrcT to y of DstT.
@@ -1504,6 +1506,13 @@ REGISTER_OP("QuantizedAdd")
     .SetIsCommutative()
     .SetShapeFn([](InferenceContext* c) {
       TF_RETURN_IF_ERROR(shape_inference::BroadcastBinaryOpShapeFn(c));
+      // min_x, max_x, min_y, max_y should be scalar.
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));
+
       c->set_output(1, c->Scalar());
       c->set_output(2, c->Scalar());
       return Status::OK();
diff --git a/tensorflow/core/ops/math_ops_test.cc b/tensorflow/core/ops/math_ops_test.cc
index 25dc033065eb3284b139f8bc2428bb20c2deba60..23f1538912c1187d965e668eecebf3774f9eae68 100644
--- a/tensorflow/core/ops/math_ops_test.cc
+++ b/tensorflow/core/ops/math_ops_test.cc
@@ -543,4 +543,19 @@ TEST(MathOpsTest, HistogramFixedWidth_ShapeFn) {
   INFER_OK(op, "[?];[2];[]", "[?]");
   INFER_OK(op, "[?];[2];?", "[?]");
 }
+
+TEST(MathOpsTest, QuantizedAdd_ShapeFn) {
+  ShapeInferenceTestOp op("QuantizedAdd");
+
+  INFER_OK(op, "?;?;?;?;?;?", "?;[];[]");
+  INFER_OK(op, "?;?;[];[];[];[]", "?;[];[]");
+  INFER_OK(op, "[1,2];?;[];[];[];[]", "?;[];[]");
+  INFER_OK(op, "[];[2];[];[];[];[]", "[d1_0];[];[]");
+
+  // Rank checks on input scalars.
+  INFER_ERROR("must be rank 0", op, "?;?;[1];?;?;?");
+  INFER_ERROR("must be rank 0", op, "?;?;?;[2];?;?");
+  INFER_ERROR("must be rank 0", op, "?;?;?;?;[3];?");
+  INFER_ERROR("must be rank 0", op, "?;?;?;?;?;[4]");
+}
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 6f07dd612e562a6b38be3fb9d95ced566b87378c..22a2f423c298e600fc68dec228061e7914a4f884 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -4977,6 +4977,13 @@ op {
     name: "DstT"
     type: "type"
   }
+  attr {
+    name: "Truncate"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "Ceil"
@@ -26182,6 +26189,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BOOL
       }
     }
   }
diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl
index 09029a4b256beceeb69c735c15bb1587cb1e06ac..3a012c23fd2313e0a4ecf7b3a20a89c930a7cc51 100644
--- a/tensorflow/core/platform/default/build_config_root.bzl
+++ b/tensorflow/core/platform/default/build_config_root.bzl
@@ -58,3 +58,9 @@ def if_static(extra_deps, otherwise=[]):
       str(Label("//tensorflow:framework_shared_object")): otherwise,
       "//conditions:default": extra_deps,
   })
+
+def if_dynamic_kernels(extra_deps, otherwise=[]):
+  return select({
+      str(Label("//tensorflow:dynamic_loaded_kernels")): extra_deps,
+      "//conditions:default": otherwise,
+  })
diff --git a/tensorflow/core/platform/env_test.cc b/tensorflow/core/platform/env_test.cc
index c461a40086360f085096a9e4dd0ab8e848d8b362..305a9a682f7969a6048192186fc83f67ac9fc50a 100644
--- a/tensorflow/core/platform/env_test.cc
+++ b/tensorflow/core/platform/env_test.cc
@@ -86,7 +86,7 @@ TEST_F(DefaultEnvTest, IncompleteReadOutOfRange) {
 
 TEST_F(DefaultEnvTest, ReadFileToString) {
   for (const int length : {0, 1, 1212, 2553, 4928, 8196, 9000, (1 << 20) - 1,
-                           1 << 20, (1 << 20) + 1}) {
+                           1 << 20, (1 << 20) + 1, (256 << 20) + 100}) {
     const string filename = strings::StrCat(BaseDir(), "/bar/..//file", length);
 
     // Write a file with the given length
diff --git a/tensorflow/core/platform/windows/port.cc b/tensorflow/core/platform/windows/port.cc
index f2aaf13bece026ce1101a80c30f799d5c06a439f..5375f563729a56f480186806f5d2869821b05cfb 100644
--- a/tensorflow/core/platform/windows/port.cc
+++ b/tensorflow/core/platform/windows/port.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/snappy.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -57,6 +58,17 @@ int NumSchedulableCPUs() {
   return system_info.dwNumberOfProcessors;
 }
 
+bool NUMAEnabled() {
+  // Not yet implemented: coming soon.
+  return false;
+}
+
+int NUMANumNodes() { return 1; }
+
+void NUMASetThreadNodeAffinity(int node) {}
+
+int NUMAGetThreadNodeAffinity() { return kNUMANoAffinity; }
+
 void* AlignedMalloc(size_t size, int minimum_alignment) {
 #ifdef TENSORFLOW_USE_JEMALLOC
   void* ptr = NULL;
@@ -108,6 +120,14 @@ void Free(void* ptr) {
 #endif
 }
 
+void* NUMAMalloc(int node, size_t size, int minimum_alignment) {
+  return AlignedMalloc(size, minimum_alignment);
+}
+
+void NUMAFree(void* ptr, size_t size) { Free(ptr); }
+
+int NUMAGetMemAffinity(const void* addr) { return kNUMANoAffinity; }
+
 void MallocExtension_ReleaseToSystem(std::size_t num_bytes) {
   // No-op.
 }
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 22a2691dcc9a7317f5205c6ae5744100ee2cfd35..d701ce8e1243f9a0f8b5c206b17bbaf635be1661 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -416,6 +416,11 @@ message RunOptions {
   int64 timeout_in_ms = 2;
 
   // The thread pool to use, if session_inter_op_thread_pool is configured.
+  // To use the caller thread set this to -1 - this uses the caller thread
+  // to execute Session::Run() and thus avoids a context switch. Using the
+  // caller thread to execute Session::Run() should be done ONLY for simple
+  // graphs, where the overhead of an additional context switch is
+  // comparable with the overhead of Session::Run().
   int32 inter_op_thread_pool = 3;
 
   // Whether the partition graph(s) executed by the executor(s) should be
diff --git a/tensorflow/core/protobuf/eager_service.proto b/tensorflow/core/protobuf/eager_service.proto
index 5b05a1b3ee1fdd6fb977910289b8bb8fe30e91c0..63ba4eb173ce9a73ee23919a2f59a2151b8f2771 100644
--- a/tensorflow/core/protobuf/eager_service.proto
+++ b/tensorflow/core/protobuf/eager_service.proto
@@ -8,6 +8,7 @@ import "tensorflow/core/framework/function.proto";
 import "tensorflow/core/framework/versions.proto";
 import "tensorflow/core/protobuf/tensorflow_server.proto";
 import "tensorflow/core/framework/tensor_shape.proto";
+import "tensorflow/core/framework/tensor.proto";
 
 message RemoteTensorHandle {
   // The ID of the operation that produced this tensor.
@@ -128,6 +129,24 @@ message RegisterFunctionRequest {
 message RegisterFunctionResponse {
 }
 
+message SendTensorRequest {
+  fixed64 context_id = 1;
+
+  // All remote tensors are identified by <Op ID, Output num>. To mimic this
+  // situation when directly sending tensors, we include an "artificial" op ID
+  // (which would have corresponded to the _Recv op when not using SendTensor).
+  int64 op_id = 2;
+  // The index within the repeated field is the output number that will help
+  // uniquely identify (along with the above op_id) the particular tensor.
+  repeated TensorProto tensors = 3;
+
+  // The device on which the tensors should be resident.
+  string device_name = 4;
+}
+
+message SendTensorResponse {
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 //
 // Eager Service defines a TensorFlow service that executes operations eagerly
@@ -174,4 +193,8 @@ service EagerService {
   // Takes a FunctionDef and makes it enqueable on the remote worker.
   rpc RegisterFunction(RegisterFunctionRequest)
       returns (RegisterFunctionResponse);
+
+  // An RPC to push tensors to the server. At times, certain environments don't
+  // allow the server to connect back to the client.
+  rpc SendTensor(SendTensorRequest) returns (SendTensorResponse);
 }
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index cb1fd09dbbbae66bd47a58ada9a90fa6dafba586..cea5e8ffb04c2e9143f9aad281e5bc0dd5799003 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -24,7 +24,7 @@ limitations under the License.
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX "-rc0"
+#define TF_VERSION_SUFFIX ""
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/core/util/equal_graph_def_test.cc b/tensorflow/core/util/equal_graph_def_test.cc
index c54540332e8522864ed9f4277c00e65857acf42f..77ca8eaec3680468c199e2eb09ecb86aba0b89fc 100644
--- a/tensorflow/core/util/equal_graph_def_test.cc
+++ b/tensorflow/core/util/equal_graph_def_test.cc
@@ -85,7 +85,7 @@ TEST_F(EqualGraphDefTest, NoMatch) {
   Input(e_.opts().WithName("A"));
   Input(a_.opts().WithName("B"));
   EXPECT_FALSE(Match());
-  EXPECT_EQ("Did not find expected node 'A = Input[]()'", diff_);
+  EXPECT_EQ("Did not find expected node '{{node A}} = Input[]()'", diff_);
 }
 
 TEST_F(EqualGraphDefTest, MissingNode) {
@@ -93,7 +93,7 @@ TEST_F(EqualGraphDefTest, MissingNode) {
   Input(e_.opts().WithName("B"));
   Input(a_.opts().WithName("A"));
   EXPECT_FALSE(Match());
-  EXPECT_EQ("Did not find expected node 'B = Input[]()'", diff_);
+  EXPECT_EQ("Did not find expected node '{{node B}} = Input[]()'", diff_);
 }
 
 TEST_F(EqualGraphDefTest, ExtraNode) {
@@ -101,7 +101,7 @@ TEST_F(EqualGraphDefTest, ExtraNode) {
   Input(a_.opts().WithName("A"));
   Input(a_.opts().WithName("B"));
   EXPECT_FALSE(Match());
-  EXPECT_EQ("Found unexpected node 'B = Input[]()'", diff_);
+  EXPECT_EQ("Found unexpected node '{{node B}} = Input[]()'", diff_);
 }
 
 TEST_F(EqualGraphDefTest, NodeOrder) {
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index bb447e03938024f4eede8955b81b8e7458590913..566a42dbd5beb23fb847c6e13d69697f9b482251 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -17,7 +17,6 @@ limitations under the License.
 #define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_
 #ifdef INTEL_MKL
 
-#include <string>
 #include <vector>
 #include <unordered_map>
 #include <utility>
@@ -1896,16 +1895,17 @@ class MklPrimitiveFactory {
   MklPrimitiveFactory() {}
   ~MklPrimitiveFactory() {}
 
-  MklPrimitive* GetOp(const std::string& key) {
+  MklPrimitive* GetOp(const string& key) {
     auto stream_iter = MklPrimitiveFactory<T>::GetHashMap().find(key);
     if (stream_iter == MklPrimitiveFactory<T>::GetHashMap().end()) {
       return nullptr;
     } else {
+      CHECK(stream_iter->second != nullptr) << "nullptr present in map";
       return stream_iter->second;
     }
   }
 
-  void SetOp(const std::string& key, MklPrimitive* op) {
+  void SetOp(const string& key, MklPrimitive* op) {
     auto stream_iter = MklPrimitiveFactory<T>::GetHashMap().find(key);
 
     CHECK(stream_iter == MklPrimitiveFactory<T>::GetHashMap().end());
@@ -1914,8 +1914,8 @@ class MklPrimitiveFactory {
   }
 
  private:
-  static inline std::unordered_map<std::string, MklPrimitive*>& GetHashMap() {
-    static thread_local std::unordered_map<std::string, MklPrimitive*> map_;
+  static inline std::unordered_map<string, MklPrimitive*>& GetHashMap() {
+    static thread_local std::unordered_map<string, MklPrimitive*> map_;
     return map_;
   }
 };
@@ -1943,9 +1943,7 @@ class FactoryKeyCreator {
     Append(StringPiece(buffer, sizeof(T)));
   }
 
-  std::string GetKey() {
-    return key_;
-  }
+  string GetKey() { return key_; }
 
  private:
   string key_;
@@ -2020,8 +2018,8 @@ class MklReorderPrimitiveFactory : public MklPrimitiveFactory<T> {
     MklReorderPrimitiveFactory() {};
     ~MklReorderPrimitiveFactory() {};
 
-    static std::string CreateKey(const memory* from, const memory* to) {
-      std::string prefix = "reorder";
+    static string CreateKey(const memory* from, const memory* to) {
+      string prefix = "reorder";
       FactoryKeyCreator key_creator;
       auto const &from_desc =  from->get_primitive_desc().desc().data;
       auto const &to_desc =  to->get_primitive_desc().desc().data;
@@ -2038,12 +2036,12 @@ class MklReorderPrimitiveFactory : public MklPrimitiveFactory<T> {
     }
 
     MklPrimitive* GetReorder(const memory* from, const memory* to) {
-      std::string key = CreateKey(from, to);
+      string key = CreateKey(from, to);
       return this->GetOp(key);
     }
 
     void SetReorder(const memory* from, const memory* to, MklPrimitive* op) {
-      std::string key = CreateKey(from, to);
+      string key = CreateKey(from, to);
       this->SetOp(key, op);
     }
 };
diff --git a/tensorflow/core/util/stat_summarizer.cc b/tensorflow/core/util/stat_summarizer.cc
index a5c1fda1024ee835fa65b3f2ffabbed7bb3461cc..2117042034b0fe89804f1f7dd3ed48ff663f3992 100644
--- a/tensorflow/core/util/stat_summarizer.cc
+++ b/tensorflow/core/util/stat_summarizer.cc
@@ -133,7 +133,6 @@ void StatSummarizer::ProcessStepStats(const StepStats& step_stats) {
 
   int64 first_node_start_us =
       step_stats.dev_stats(0).node_stats(0).all_start_micros();
-  std::map<std::string, Detail> details;
 
   int node_num = 0;
   for (const auto& ds : step_stats.dev_stats()) {
@@ -177,22 +176,15 @@ void StatSummarizer::ProcessStepStats(const StepStats& step_stats) {
       ++node_num;
       const int64 curr_time = ns.all_end_rel_micros();
       curr_total_us += curr_time;
-      auto result = details.emplace(name, Detail());
       auto output_result =
           outputs_.emplace(name, std::vector<TensorDescription>());
       std::vector<TensorDescription>* outputs = &(output_result.first->second);
-      Detail* detail = &(result.first->second);
 
-      detail->start_us.UpdateStat(ns.all_start_micros() - first_node_start_us);
-      detail->rel_end_us.UpdateStat(curr_time);
+      int64_t start_us = (ns.all_start_micros() - first_node_start_us);
+      int64_t rel_end_us = curr_time;
 
       // If this is the first pass, initialize some values.
-      if (result.second) {
-        detail->name = name;
-        detail->type = op_type;
-
-        detail->run_order = node_num;
-
+      if (output_result.second) {
         outputs->resize(ns.output_size());
         for (const auto& output : ns.output()) {
           const int32 slot = output.slot();
@@ -202,7 +194,6 @@ void StatSummarizer::ProcessStepStats(const StepStats& step_stats) {
           }
           (*outputs)[slot] = output.tensor_description();
         }
-        detail->times_called = 0;
       }
 
       int64 curr_node_mem = 0;
@@ -210,11 +201,10 @@ void StatSummarizer::ProcessStepStats(const StepStats& step_stats) {
         const int64 mem_usage = mem.total_bytes();
         curr_node_mem += mem_usage;
       }
-      detail->mem_used.UpdateStat(curr_node_mem);
-      mem_total += curr_node_mem;
+      stats_calculator_->AddNodeStats(name, op_type, node_num, start_us,
+                                      rel_end_us, curr_node_mem);
 
-      ++detail->times_called;
-      stats_calculator_->UpdateDetails(details);
+      mem_total += curr_node_mem;
 
       Validate(outputs, ns);
     }
diff --git a/tensorflow/core/util/stats_calculator.cc b/tensorflow/core/util/stats_calculator.cc
index c4befbdb84fec9b09d345a344f249ec758883409..eb077546501327c62aff5c9d68eb5d0ba1c9aa1c 100644
--- a/tensorflow/core/util/stats_calculator.cc
+++ b/tensorflow/core/util/stats_calculator.cc
@@ -272,9 +272,24 @@ std::string StatsCalculator::GetOutputString() const {
   return stream.str();
 }
 
-void StatsCalculator::UpdateDetails(
-    const std::map<std::string, Detail>& details) {
-  details_.insert(details.begin(), details.end());
+void StatsCalculator::AddNodeStats(const std::string& name,
+                                   const std::string& type, int64_t run_order,
+                                   int64_t start_us, int64_t rel_end_us,
+                                   int64_t mem_used) {
+  Detail* detail = nullptr;
+  if (details_.find(name) == details_.end()) {
+    details_.insert({name, {}});
+    detail = &details_.at(name);
+    detail->type = type;
+    detail->name = name;
+    detail->run_order = run_order;
+  } else {
+    detail = &details_.at(name);
+  }
+  detail->start_us.UpdateStat(start_us);
+  detail->rel_end_us.UpdateStat(rel_end_us);
+  detail->mem_used.UpdateStat(mem_used);
+  detail->times_called++;
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/util/stats_calculator.h b/tensorflow/core/util/stats_calculator.h
index 39cef816f1987ca1d3e2e9f3bcb4aab58894c321..e191737bb2c8eb85518e51b3a06884a7983a392e 100644
--- a/tensorflow/core/util/stats_calculator.h
+++ b/tensorflow/core/util/stats_calculator.h
@@ -163,7 +163,10 @@ class StatsCalculator {
   };
 
   const std::map<std::string, Detail>& GetDetails() const { return details_; }
-  void UpdateDetails(const std::map<std::string, Detail>& details);
+
+  void AddNodeStats(const std::string& name, const std::string& type,
+                    int64_t run_order, int64_t start_us, int64_t rel_end_us,
+                    int64_t mem_used);
 
  private:
   void OrderNodesByMetric(SortingMetric sorting_metric,
diff --git a/tensorflow/core/util/stats_calculator_test.cc b/tensorflow/core/util/stats_calculator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..00d7bfc2f9566c1a0836372dcd1077bbfaa43a1b
--- /dev/null
+++ b/tensorflow/core/util/stats_calculator_test.cc
@@ -0,0 +1,76 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/stats_calculator.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+using Detail = StatsCalculator::Detail;
+
+TEST(StatsCalculatorTest, TotalTimeMs) {
+  auto options = StatSummarizerOptions();
+  StatsCalculator calc(options);
+
+  EXPECT_EQ(0, calc.num_runs());
+  calc.UpdateRunTotalUs(1);
+
+  EXPECT_EQ(1, calc.num_runs());
+  calc.UpdateRunTotalUs(2);
+
+  EXPECT_EQ(2, calc.num_runs());
+  auto run_time_us = calc.run_total_us();
+  EXPECT_EQ(1, run_time_us.min());
+  EXPECT_FLOAT_EQ(1.5, run_time_us.avg());
+}
+
+TEST(StatsCalculatorTest, AddNodeStatsUpdate) {
+  auto options = StatSummarizerOptions();
+  StatsCalculator calc(options);
+  EXPECT_TRUE(calc.GetDetails().empty());
+
+  const int64_t node1_run_order = 1;
+  const int64_t run1_start_us = 1;
+  const int64_t run1_end_us = 2;
+  const int64_t run1_mem_used = 45;
+  calc.AddNodeStats("node1", "type_1", node1_run_order, run1_start_us,
+                    run1_end_us, run1_mem_used);
+  ASSERT_EQ(1, calc.GetDetails().size());
+  const Detail& detail = calc.GetDetails().at("node1");
+  EXPECT_EQ(1, detail.times_called);
+  EXPECT_EQ("node1", detail.name);
+  EXPECT_EQ("type_1", detail.type);
+  EXPECT_EQ(node1_run_order, detail.run_order);
+
+  const int64_t run2_start_us = 3;
+  const int64_t run2_end_us = 5;
+  const int64_t run2_mem_used = 145;
+  calc.AddNodeStats("node1", "type_1", node1_run_order, run2_start_us,
+                    run2_end_us, run2_mem_used);
+  EXPECT_EQ(1, calc.GetDetails().size());
+
+  EXPECT_EQ(2, detail.times_called);
+  EXPECT_EQ("node1", detail.name);
+  EXPECT_EQ("type_1", detail.type);
+  EXPECT_EQ(node1_run_order, detail.run_order);
+
+  EXPECT_EQ(run1_start_us + run2_start_us, detail.start_us.sum());
+  EXPECT_EQ(run1_end_us + run2_end_us, detail.rel_end_us.sum());
+  EXPECT_EQ(run1_mem_used + run2_mem_used, detail.mem_used.sum());
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/docs_src/guide/debugger.md b/tensorflow/docs_src/guide/debugger.md
index 8d78fe6fbd2c0390db3b3dfc29bb7a619aa43df7..f0e465214e0b8fc5e2dabd0a31b9830f77c26bb9 100644
--- a/tensorflow/docs_src/guide/debugger.md
+++ b/tensorflow/docs_src/guide/debugger.md
@@ -780,7 +780,7 @@ sess.run(b)
 ``` python
 import numpy as np
 
-a = tf.Variable(np.ones[10], name="a")
+a = tf.Variable(np.ones(10), name="a")
 b = tf.add(a, a, name="b")
 sess = tf.Session()
 sess.run(tf.global_variables_initializer())
diff --git a/tensorflow/docs_src/guide/eager.md b/tensorflow/docs_src/guide/eager.md
index 42ad9652f8c2927e304b233105fdc9c0a83e9af9..3b54d6d2bbc15180fee96a8d3be4537905740667 100644
--- a/tensorflow/docs_src/guide/eager.md
+++ b/tensorflow/docs_src/guide/eager.md
@@ -504,13 +504,13 @@ with tf.device("gpu:0"):
 
 ### Object-based saving
 
-`tfe.Checkpoint` can save and restore `tf.Variable`s to and from
+`tf.train.Checkpoint` can save and restore `tf.Variable`s to and from
 checkpoints:
 
 ```py
 x = tf.Variable(10.)
 
-checkpoint = tfe.Checkpoint(x=x)  # save as "x"
+checkpoint = tf.train.Checkpoint(x=x)  # save as "x"
 
 x.assign(2.)   # Assign a new value to the variables and save.
 save_path = checkpoint.save('./ckpt/')
@@ -523,18 +523,18 @@ checkpoint.restore(save_path)
 print(x)  # => 2.0
 ```
 
-To save and load models, `tfe.Checkpoint` stores the internal state of objects,
+To save and load models, `tf.train.Checkpoint` stores the internal state of objects,
 without requiring hidden variables. To record the state of a `model`,
-an `optimizer`, and a global step, pass them to a `tfe.Checkpoint`:
+an `optimizer`, and a global step, pass them to a `tf.train.Checkpoint`:
 
 ```py
 model = MyModel()
 optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
 checkpoint_dir = ‘/path/to/model_dir’
 checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
-root = tfe.Checkpoint(optimizer=optimizer,
-                      model=model,
-                      optimizer_step=tf.train.get_or_create_global_step())
+root = tf.train.Checkpoint(optimizer=optimizer,
+                           model=model,
+                           optimizer_step=tf.train.get_or_create_global_step())
 
 root.save(file_prefix=checkpoint_prefix)
 # or
@@ -824,7 +824,7 @@ gives you eager's interactive experimentation and debuggability with the
 distributed performance benefits of graph execution.
 
 Write, debug, and iterate in eager execution, then import the model graph for
-production deployment. Use `tfe.Checkpoint` to save and restore model
+production deployment. Use `tf.train.Checkpoint` to save and restore model
 variables, this allows movement between eager and graph execution environments.
 See the examples in:
 [tensorflow/contrib/eager/python/examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples).
diff --git a/tensorflow/docs_src/guide/keras.md b/tensorflow/docs_src/guide/keras.md
index 1d846df1044cd7100c083aa7d6b5be8f9cdd584e..2330fa03c7401c91d588f47d7d62484c09a73c5f 100644
--- a/tensorflow/docs_src/guide/keras.md
+++ b/tensorflow/docs_src/guide/keras.md
@@ -467,13 +467,13 @@ JSON and YAML serialization formats:
 json_string = model.to_json()
 
 # Recreate the model (freshly initialized)
-fresh_model = keras.models.from_json(json_string)
+fresh_model = keras.models.model_from_json(json_string)
 
 # Serializes a model to YAML format
 yaml_string = model.to_yaml()
 
 # Recreate the model
-fresh_model = keras.models.from_yaml(yaml_string)
+fresh_model = keras.models.model_from_yaml(yaml_string)
 ```
 
 Caution: Subclassed models are not serializable because their architecture is
diff --git a/tensorflow/docs_src/guide/saved_model.md b/tensorflow/docs_src/guide/saved_model.md
index acc3d3ca0b74f4898523e4af0452f65463d8b94f..717488e7cc2643d03394f01ff6e18963fae80e31 100644
--- a/tensorflow/docs_src/guide/saved_model.md
+++ b/tensorflow/docs_src/guide/saved_model.md
@@ -2,9 +2,8 @@
 
 The @{tf.train.Saver} class provides methods to save and restore models. The
 @{tf.saved_model.simple_save} function is an easy way to build a
-@{tf.saved_model$saved model} suitable for serving.
-[Estimators](@{$guide/estimators}) automatically save and restore
-variables in the `model_dir`.
+@{tf.saved_model$saved model} suitable for serving. [Estimators](./estimators)
+automatically save and restore variables in the `model_dir`.
 
 ## Save and restore variables
 
diff --git a/tensorflow/docs_src/guide/using_gpu.md b/tensorflow/docs_src/guide/using_gpu.md
index c429ca4750753278e4736650a08fd0c71e0d9fad..c0218fd12e1f5d69d667e50472fa75ed394e9318 100644
--- a/tensorflow/docs_src/guide/using_gpu.md
+++ b/tensorflow/docs_src/guide/using_gpu.md
@@ -143,7 +143,7 @@ If the device you have specified does not exist, you will get
 ```
 InvalidArgumentError: Invalid argument: Cannot assign a device to node 'b':
 Could not satisfy explicit device specification '/device:GPU:2'
-   [[Node: b = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [3,2]
+   [[{{node b}} = Const[dtype=DT_FLOAT, value=Tensor<type: float shape: [3,2]
    values: 1 2 3...>, _device="/device:GPU:2"]()]]
 ```
 
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 4e1c32f9722e95568989a6eff18f8cdb4c0e3a72..cf869e86552f609ae1f4f1c426168d8ddcc5ea90 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.9.0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 162a820f22138bc5bde6f90cc67b22d1fa5c59f4..4ec7e42773bb1b1221f6bbd4c1f575424eacb6ac 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.9.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.9.0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index c196bb9b31f9b507cc3c453eebf391b0b12c1caa..c5f760d2542c09f743199ae219115c84e62cf578 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.9.0-rc0</version>
+  <version>1.9.0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.9.0-rc0</version>
+                 <version>1.9.0</version>
                </dependency>
              </dependencies>
          </project>
@@ -124,12 +124,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.9.0-rc0</version>
+  <version>1.9.0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.9.0-rc0</version>
+  <version>1.9.0</version>
 </dependency>
 ```
 
@@ -148,7 +148,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -167,7 +167,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.9.0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -175,10 +175,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.9.0-rc0.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.9.0.zip).
   3. Extract this .zip file.
 
 __Note__: The native library (`tensorflow_jni.dll`) requires `msvcp140.dll` at runtime, which is included in the [Visual C++ 2015 Redistributable](https://www.microsoft.com/en-us/download/details.aspx?id=48145) package. 
@@ -227,7 +227,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.9.0-rc0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.9.0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -241,11 +241,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.9.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.9.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.9.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.9.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 7534d0fac118f713995f264db43fc08e4336c11b..3a9a01c57ec7e5906109502cd933910b0f6e20d3 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -65,7 +65,7 @@ We *recommend* using `pip` version 8.1 or higher. If using a release before
 version 8.1, upgrade `pip`:
 
 <pre class="prettyprint lang-bsh">
-  <code class="devsite-terminal">sudo pip install -U pip</code>
+  <code class="devsite-terminal">pip install --upgrade pip</code>
 </pre>
 
 If not using Ubuntu and [setuptools](https://pypi.org/project/setuptools/) is
@@ -102,7 +102,7 @@ When the Virtualenv is activated, the shell prompt displays as `(venv) $`.
 Within the active virtual environment, upgrade `pip`:
 
 <pre class="prettyprint lang-bsh">
-(venv)$ pip install -U pip
+(venv)$ pip install --upgrade pip
 </pre>
 
 You can install other Python packages within the virtual environment without
@@ -120,7 +120,7 @@ Choose one of the available TensorFlow packages for installation:
 Within an active Virtualenv environment, use `pip` to install the package:
 
 <pre class="prettyprint lang-bsh">
-  <code class="devsite-terminal">pip install -U tensorflow</code>
+  <code class="devsite-terminal">pip install --upgrade tensorflow</code>
 </pre>
 
 Use `pip list` to show the packages installed in the virtual environment.
@@ -198,7 +198,7 @@ We *recommend* using `pip` version 8.1 or higher. If using a release before
 version 8.1, upgrade `pip`:
 
 <pre class="prettyprint lang-bsh">
-  <code class="devsite-terminal">sudo pip install -U pip</code>
+  <code class="devsite-terminal">pip install --upgrade pip</code>
 </pre>
 
 If not using Ubuntu and [setuptools](https://pypi.org/project/setuptools/) is
@@ -220,8 +220,8 @@ Choose one of the available TensorFlow packages for installation:
 And use `pip` to install the package for Python 2 or 3:
 
 <pre class="prettyprint lang-bsh">
-  <code class="devsite-terminal">sudo pip install -U tensorflow   # Python 2.7</code>
-  <code class="devsite-terminal">sudo pip3 install -U tensorflow  # Python 3.n</code>
+  <code class="devsite-terminal">pip install --upgrade --user tensorflow   # Python 2.7</code>
+  <code class="devsite-terminal">pip3 install --upgrade --user tensorflow  # Python 3.n</code>
 </pre>
 
 Use `pip list` to show the packages installed on the system.
@@ -239,8 +239,8 @@ If the above steps failed, try installing the TensorFlow binary using the remote
 URL of the `pip` package:
 
 <pre class="prettyprint lang-bsh">
-  <code class="devsite-terminal">sudo pip install --upgrade <var>remote-pkg-URL</var>   # Python 2.7</code>
-  <code class="devsite-terminal">sudo pip3 install --upgrade <var>remote-pkg-URL</var>  # Python 3.n</code>
+  <code class="devsite-terminal">pip install --user --upgrade <var>remote-pkg-URL</var>   # Python 2.7</code>
+  <code class="devsite-terminal">pip3 install --user --upgrade <var>remote-pkg-URL</var>  # Python 3.n</code>
 </pre>
 
 The <var>remote-pkg-URL</var> depends on the operating system, Python version,
@@ -255,8 +255,8 @@ encounter problems.
 To uninstall TensorFlow on your system, use one of following commands:
 
 <pre class="prettyprint lang-bsh">
-  <code class="devsite-terminal">sudo pip uninstall tensorflow   # for Python 2.7</code>
-  <code class="devsite-terminal">sudo pip3 uninstall tensorflow  # for Python 3.n</code>
+  <code class="devsite-terminal">pip uninstall tensorflow   # for Python 2.7</code>
+  <code class="devsite-terminal">pip3 uninstall tensorflow  # for Python 3.n</code>
 </pre>
 
 <a name="InstallingDocker"></a>
@@ -436,7 +436,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 <a name="ValidateYourInstallation"></a>
 
@@ -650,13 +650,13 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0-cp27-none-linux_x86_64.whl
 </pre>
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -667,13 +667,13 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -684,13 +684,13 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -701,13 +701,13 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 3372e9e1e08208fe9994853f1add82860eb13be8..1a7b2b815d101e1dca87a2dd987441a0b51f636a 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -242,7 +242,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -517,7 +517,7 @@ The value you specify depends on your Python version.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0-py2-none-any.whl
 </pre>
 
 
@@ -525,5 +525,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-a
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 502f4de7a69d2e5b6828e93a34482438b59b89f0..31dcad64d43bc9cef46839db050b88944f3375fb 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -223,46 +223,90 @@ will likely differ from our sample input:
 <pre>
 $ <b>cd tensorflow</b>  # cd to the top-level directory created
 $ <b>./configure</b>
+You have bazel 0.15.0 installed.
 Please specify the location of python. [Default is /usr/bin/python]: <b>/usr/bin/python2.7</b>
+
+
 Found possible Python library paths:
   /usr/local/lib/python2.7/dist-packages
   /usr/lib/python2.7/dist-packages
 Please input the desired Python library path to use.  Default is [/usr/lib/python2.7/dist-packages]
 
-Using python library path: /usr/local/lib/python2.7/dist-packages
-Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is -march=native]:
-Do you wish to use jemalloc as the malloc implementation? [Y/n]
-jemalloc enabled
-Do you wish to build TensorFlow with Google Cloud Platform support? [y/N]
-No Google Cloud Platform support will be enabled for TensorFlow
-Do you wish to build TensorFlow with Hadoop File System support? [y/N]
-No Hadoop File System support will be enabled for TensorFlow
-Do you wish to build TensorFlow with the XLA just-in-time compiler (experimental)? [y/N]
-No XLA support will be enabled for TensorFlow
-Do you wish to build TensorFlow with VERBS support? [y/N]
-No VERBS support will be enabled for TensorFlow
-Do you wish to build TensorFlow with OpenCL support? [y/N]
-No OpenCL support will be enabled for TensorFlow
-Do you wish to build TensorFlow with CUDA support? [y/N] <b>Y</b>
-CUDA support will be enabled for TensorFlow
-Do you want to use clang as CUDA compiler? [y/N]
-nvcc will be used as CUDA compiler
+Do you wish to build TensorFlow with jemalloc as malloc support? [Y/n]:
+jemalloc as malloc support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with Google Cloud Platform support? [Y/n]:
+Google Cloud Platform support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with Hadoop File System support? [Y/n]:
+Hadoop File System support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with Amazon AWS Platform support? [Y/n]:
+Amazon AWS Platform support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with Apache Kafka Platform support? [Y/n]:
+Apache Kafka Platform support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with XLA JIT support? [y/N]:
+No XLA JIT support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with GDR support? [y/N]:
+No GDR support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with VERBS support? [y/N]:
+No VERBS support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with OpenCL SYCL support? [y/N]:
+No OpenCL SYCL support will be enabled for TensorFlow.
+
+Do you wish to build TensorFlow with CUDA support? [y/N]: <b>Y</b>
+CUDA support will be enabled for TensorFlow.
+
 Please specify the CUDA SDK version you want to use. [Leave empty to default to CUDA 9.0]: <b>9.0</b>
+
+
 Please specify the location where CUDA 9.0 toolkit is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
-Please specify which gcc should be used by nvcc as the host compiler. [Default is /usr/bin/gcc]:
-Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 7.0]: <b>7</b>
+
+
+Please specify the cuDNN version you want to use. [Leave empty to default to cuDNN 7.0]: <b>7.0</b>
+
+
 Please specify the location where cuDNN 7 library is installed. Refer to README.md for more details. [Default is /usr/local/cuda]:
-Please specify a list of comma-separated CUDA compute capabilities you want to build with.
+
+
+Do you wish to build TensorFlow with TensorRT support? [y/N]:
+No TensorRT support will be enabled for TensorFlow.
+
+Please specify the NCCL version you want to use. If NCLL 2.2 is not installed, then you can use version 1.3 that can be fetched automatically but it may have worse performance with multiple GPUs. [Default is 2.2]: 1.3
+
+
+Please specify a list of comma-separated Cuda compute capabilities you want to build with.
 You can find the compute capability of your device at: https://developer.nvidia.com/cuda-gpus.
-Please note that each additional compute capability significantly increases your build time and binary size.
+Please note that each additional compute capability significantly increases your
+build time and binary size. [Default is: 3.5,7.0] <b>6.1</b>
+
+
+Do you want to use clang as CUDA compiler? [y/N]:
+nvcc will be used as CUDA compiler.
 
-Do you wish to build TensorFlow with MPI support? [y/N]
-MPI support will not be enabled for TensorFlow
+Please specify which gcc should be used by nvcc as the host compiler. [Default is /usr/bin/gcc]:
+
+
+Do you wish to build TensorFlow with MPI support? [y/N]:
+No MPI support will be enabled for TensorFlow.
+
+Please specify optimization flags to use during compilation when bazel option "--config=opt" is specified [Default is -march=native]:
+
+
+Would you like to interactively configure ./WORKSPACE for Android builds? [y/N]:
+Not configuring the WORKSPACE for Android builds.
+
+Preconfigured Bazel build configs. You can use any of the below by adding "--config=<>" to your build command. See tools/bazel.rc for more details.
+    --config=mkl            # Build with MKL support.
+    --config=monolithic     # Config for mostly static monolithic build.
 Configuration finished
 </pre>
 
-[Default is: "3.5,7.0"]: <b>6.0,7.0</b>
-
 If you told `configure` to build for GPU support, then `configure` will create a
 canonical set of symbolic links to the CUDA libraries on your system. Therefore,
 every time you change the CUDA library paths, you must rerun the `configure`
@@ -330,10 +374,10 @@ Invoke `pip install` to install that pip package. The filename of the `.whl`
 file depends on your platform. For example, the following command will install
 the pip package
 
-for TensorFlow 1.9.0rc0 on Linux:
+for TensorFlow 1.9.0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.9.0rc0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.9.0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
diff --git a/tensorflow/docs_src/mobile/README.md b/tensorflow/docs_src/mobile/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ecf42672654ab4a8d2ea8c9bb4752ed65d6c8a9a
--- /dev/null
+++ b/tensorflow/docs_src/mobile/README.md
@@ -0,0 +1,3 @@
+# TF Lite subsite
+
+This subsite directory lives in [tensorflow/contrib/lite/g3doc](../../contrib/lite/g3doc/).
diff --git a/tensorflow/docs_src/mobile/index.md b/tensorflow/docs_src/mobile/index.md
deleted file mode 100644
index 6032fcad020fcd6c2e52c863c5c992028eed4ff3..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/mobile/index.md
+++ /dev/null
@@ -1,33 +0,0 @@
-# Overview
-
-TensorFlow was designed to be a good deep learning solution for mobile
-platforms. Currently we have two solutions for deploying machine learning
-applications on mobile and embedded devices:
-@{$mobile/mobile_intro$TensorFlow for Mobile} and @{$mobile/tflite$TensorFlow Lite}.
-
-## TensorFlow Lite versus TensorFlow Mobile
-
-Here are a few of the differences between the two:
-
-- TensorFlow Lite is an evolution of TensorFlow Mobile.  In most cases, apps
-  developed with TensorFlow Lite will have a smaller binary size, fewer
-  dependencies, and better performance.
-
-- TensorFlow Lite supports only a limited set of operators, so not all models
-  will work on it by default. TensorFlow for Mobile has a fuller set of
-  supported functionality.
-
-TensorFlow Lite provides better performance and a small binary size on mobile
-platforms as well as the ability to leverage hardware acceleration if available
-on their platforms. In addition, it has many fewer dependencies so it can be
-built and hosted on simpler, more constrained device scenarios. TensorFlow Lite
-also allows targeting accelerators through the [Neural Networks
-API](https://developer.android.com/ndk/guides/neuralnetworks/index.html).
-
-TensorFlow Lite currently has coverage for a limited set of operators. While
-TensorFlow for Mobile supports only a constrained set of ops by default, in
-principle if you use an arbitrary operator in TensorFlow, it can be customized
-to build that kernel. Thus use cases which are not currently supported by
-TensorFlow Lite should continue to use TensorFlow for Mobile. As TensorFlow Lite
-evolves, it will gain additional operators, and the decision will be easier to
-make.
diff --git a/tensorflow/docs_src/mobile/leftnav_files b/tensorflow/docs_src/mobile/leftnav_files
deleted file mode 100644
index 97340ef7e1af64634f8590b5d21a344b5181cb73..0000000000000000000000000000000000000000
--- a/tensorflow/docs_src/mobile/leftnav_files
+++ /dev/null
@@ -1,15 +0,0 @@
-index.md
-### TensorFlow Lite
-tflite/index.md
-tflite/devguide.md
-tflite/demo_android.md
-tflite/demo_ios.md
-tflite/performance.md
->>>
-### TensorFlow Mobile
-mobile_intro.md
-android_build.md
-ios_build.md
-linking_libs.md
-prepare_models.md
-optimizing.md
diff --git a/tensorflow/docs_src/performance/performance_guide.md b/tensorflow/docs_src/performance/performance_guide.md
index cb0f5ca9242098d06aa0a9898e4a3774fab527b8..dafacbe37974f80c85131509824956ea1c5c8426 100644
--- a/tensorflow/docs_src/performance/performance_guide.md
+++ b/tensorflow/docs_src/performance/performance_guide.md
@@ -464,7 +464,7 @@ equal to the number of physical cores rather than logical cores.
   config = tf.ConfigProto()
   config.intra_op_parallelism_threads = 44
   config.inter_op_parallelism_threads = 44
-  tf.session(config=config)
+  tf.Session(config=config)
 
 ```
 
diff --git a/tensorflow/docs_src/performance/xla/broadcasting.md b/tensorflow/docs_src/performance/xla/broadcasting.md
index eaa709c2f84245341044b93060f932a22fbe54c7..7018ded53f8bc078a43b6af54a9ba13796374458 100644
--- a/tensorflow/docs_src/performance/xla/broadcasting.md
+++ b/tensorflow/docs_src/performance/xla/broadcasting.md
@@ -99,7 +99,7 @@ dimensions 1 and 2 of the cuboid.
 
 This type of broadcast is used in the binary ops in `XlaBuilder`, if the
 `broadcast_dimensions` argument is given. For example, see
-[XlaBuilder::Add](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.cc).
+[XlaBuilder::Add](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.cc).
 In the XLA source code, this type of broadcasting is sometimes called "InDim"
 broadcasting.
 
diff --git a/tensorflow/docs_src/performance/xla/operation_semantics.md b/tensorflow/docs_src/performance/xla/operation_semantics.md
index 68c427a31661e26adf2d0d599a5d9eb4a18f57be..5f7482f90f1c9312f23aa299a5592ade830cb984 100644
--- a/tensorflow/docs_src/performance/xla/operation_semantics.md
+++ b/tensorflow/docs_src/performance/xla/operation_semantics.md
@@ -1,7 +1,7 @@
 # Operation Semantics
 
 The following describes the semantics of operations defined in the
-[`XlaBuilder`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
+[`XlaBuilder`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
 interface. Typically, these operations map one-to-one to operations defined in
 the RPC interface in
 [`xla_data.proto`](https://www.tensorflow.org/code/tensorflow/compiler/xla/xla_data.proto).
@@ -16,7 +16,7 @@ and familiar names; for example a *vector* is a 1-dimensional array and a
 ## BatchNormGrad
 
 See also
-[`XlaBuilder::BatchNormGrad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
+[`XlaBuilder::BatchNormGrad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
 and [the original batch normalization paper](https://arxiv.org/abs/1502.03167)
 for a detailed description of the algorithm.
 
@@ -80,7 +80,7 @@ The output type is a tuple of three handles:
 ## BatchNormInference
 
 See also
-[`XlaBuilder::BatchNormInference`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
+[`XlaBuilder::BatchNormInference`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
 and [the original batch normalization paper](https://arxiv.org/abs/1502.03167)
 for a detailed description of the algorithm.
 
@@ -115,7 +115,7 @@ The output is an n-dimensional, normalized array with the same shape as input
 ## BatchNormTraining
 
 See also
-[`XlaBuilder::BatchNormTraining`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
+[`XlaBuilder::BatchNormTraining`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
 and [`the original batch normalization paper`](https://arxiv.org/abs/1502.03167)
 for a detailed description of the algorithm.
 
@@ -167,7 +167,7 @@ spatial dimensions using the formulas above.
 ## BitcastConvertType
 
 See also
-[`XlaBuilder::BitcastConvertType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::BitcastConvertType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Similar to a `tf.bitcast` in TensorFlow, performs an element-wise bitcast
 operation from a data shape to a target shape. The dimensions must match, and
@@ -189,7 +189,7 @@ and destination element types must not be tuples.
 ## Broadcast
 
 See also
-[`XlaBuilder::Broadcast`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Broadcast`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Adds dimensions to an array by duplicating the data in the array.
 
@@ -217,7 +217,7 @@ For example, if `operand` is a scalar `f32` with value `2.0f`, and
 ## Call
 
 See also
-[`XlaBuilder::Call`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Call`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Invokes a computation with the given arguments.
 
@@ -236,7 +236,7 @@ The arity and types of the `args` must match the parameters of the
 ## Clamp
 
 See also
-[`XlaBuilder::Clamp`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Clamp`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Clamps an operand to within the range between a minimum and maximum value.
 
@@ -269,7 +269,7 @@ Clamp(min, operand, max) = s32[3]{0, 5, 6};
 ## Collapse
 
 See also
-[`XlaBuilder::Collapse`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
+[`XlaBuilder::Collapse`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
 and the @{tf.reshape} operation.
 
 Collapses dimensions of an array into one dimension.
@@ -332,7 +332,7 @@ then v12 == f32[8x3] {{10, 11, 12},
 ## Concatenate
 
 See also
-[`XlaBuilder::ConcatInDim`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::ConcatInDim`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Concatenate composes an array from multiple array operands. The array is of the
 same rank as each of the input array operands (which must be of the same rank as
@@ -388,7 +388,7 @@ Diagram:
 ## Conditional
 
 See also
-[`XlaBuilder::Conditional`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Conditional`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 <b> `Conditional(pred, true_operand, true_computation, false_operand,
 false_computation)` </b>
@@ -416,7 +416,7 @@ executed depending on the value of `pred`.
 ## Conv (convolution)
 
 See also
-[`XlaBuilder::Conv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Conv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 As ConvWithGeneralPadding, but the padding is specified in a short-hand way as
 either SAME or VALID. SAME padding pads the input (`lhs`) with zeroes so that
@@ -426,7 +426,7 @@ account. VALID padding simply means no padding.
 ## ConvWithGeneralPadding (convolution)
 
 See also
-[`XlaBuilder::ConvWithGeneralPadding`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::ConvWithGeneralPadding`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Computes a convolution of the kind used in neural networks. Here, a convolution
 can be thought of as a n-dimensional window moving across a n-dimensional base
@@ -538,7 +538,7 @@ for (b, oz, oy, ox) {  // output coordinates
 ## ConvertElementType
 
 See also
-[`XlaBuilder::ConvertElementType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::ConvertElementType`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Similar to an element-wise `static_cast` in C++, performs an element-wise
 conversion operation from a data shape to a target shape. The dimensions must
@@ -572,7 +572,7 @@ then b == f32[3]{0.0, 1.0, 2.0}
 ## CrossReplicaSum
 
 See also
-[`XlaBuilder::CrossReplicaSum`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::CrossReplicaSum`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Computes a sum across replicas.
 
@@ -607,7 +607,7 @@ than another.
 ## CustomCall
 
 See also
-[`XlaBuilder::CustomCall`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::CustomCall`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Call a user-provided function within a computation.
 
@@ -668,7 +668,7 @@ idempotent.
 ## Dot
 
 See also
-[`XlaBuilder::Dot`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Dot`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 <b> `Dot(lhs, rhs)` </b>
 
@@ -697,7 +697,7 @@ multiplications or matrix/matrix multiplications.
 ## DotGeneral
 
 See also
-[`XlaBuilder::DotGeneral`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::DotGeneral`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 <b> `DotGeneral(lhs, rhs, dimension_numbers)` </b>
 
@@ -784,15 +784,13 @@ non-contracting/non-batch dimension.
 ## DynamicSlice
 
 See also
-[`XlaBuilder::DynamicSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::DynamicSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 DynamicSlice extracts a sub-array from the input array at dynamic
 `start_indices`. The size of the slice in each dimension is passed in
 `size_indices`, which specify the end point of exclusive slice intervals in each
 dimension: [start, start + size). The shape of `start_indices` must be rank ==
 1, with dimension size equal to the rank of `operand`.
-Note: handling of out-of-bounds slice indices (generated by incorrect runtime
-calculation of 'start_indices') is currently implementation-defined.
 
 <b> `DynamicSlice(operand, start_indices, size_indices)` </b>
 
@@ -812,6 +810,17 @@ calculation of 'start_indices') is currently implementation-defined.
 :                 :                     : dimension to avoid wrapping modulo  :
 :                 :                     : dimension size.                     :
 
+The effective slice indices are computed by applying the following
+transformation for each index `i` in `[1, N)` before performing the slice:
+
+```
+start_indices[i] = clamp(start_indices[i], 0, operand.dimension_size[i] - size_indices[i])
+```
+
+This ensures that the extracted slice is always in-bounds with respect to the
+operand array. If the slice is in-bounds before the transformation is applied,
+the transformation has no effect.
+
 1-dimensional example:
 
 ```
@@ -839,7 +848,7 @@ DynamicSlice(b, s, {2, 2}) produces:
 ## DynamicUpdateSlice
 
 See also
-[`XlaBuilder::DynamicUpdateSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::DynamicUpdateSlice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 DynamicUpdateSlice generates a result which is the value of the input array
 `operand`, with a slice `update` overwritten at `start_indices`.
@@ -847,8 +856,6 @@ The shape of `update` determines the shape of the sub-array of the result which
 is updated.
 The shape of `start_indices` must be rank == 1, with dimension size equal to
 the rank of `operand`.
-Note: handling of out-of-bounds slice indices (generated by incorrect runtime
-calculation of 'start_indices') is currently implementation-defined.
 
 <b> `DynamicUpdateSlice(operand, update, start_indices)` </b>
 
@@ -866,6 +873,17 @@ calculation of 'start_indices') is currently implementation-defined.
 :                 :         : dimension. Value must be greater than or equal   :
 :                 :         : to zero.                                         :
 
+The effective slice indices are computed by applying the following
+transformation for each index `i` in `[1, N)` before performing the slice:
+
+```
+start_indices[i] = clamp(start_indices[i], 0, operand.dimension_size[i] - update.dimension_size[i])
+```
+
+This ensures that the updated slice is always in-bounds with respect to the
+operand array. If the slice is in-bounds before the transformation is applied,
+the transformation has no effect.
+
 1-dimensional example:
 
 ```
@@ -902,7 +920,7 @@ DynamicUpdateSlice(b, u, s) produces:
 ## Element-wise binary arithmetic operations
 
 See also
-[`XlaBuilder::Add`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Add`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 A set of element-wise binary arithmetic operations is supported.
 
@@ -947,7 +965,7 @@ shapes of both operands. The semantics are described in detail on the
 ## Element-wise comparison operations
 
 See also
-[`XlaBuilder::Eq`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Eq`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 A set of standard element-wise binary comparison operations is supported. Note
 that standard IEEE 754 floating-point comparison semantics apply when comparing
@@ -1033,7 +1051,7 @@ potentially different runtime offset) of an input tensor into an output tensor.
 ### General Semantics
 
 See also
-[`XlaBuilder::Gather`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Gather`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 For a more intuitive description, see the "Informal Description" section below.
 
 <b> `gather(operand, gather_indices, output_window_dims, elided_window_dims, window_bounds, gather_dims_to_operand_dims)` </b>
@@ -1236,7 +1254,7 @@ concatenation of all these rows.
 ## GetTupleElement
 
 See also
-[`XlaBuilder::GetTupleElement`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::GetTupleElement`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Indexes into a tuple with a compile-time-constant value.
 
@@ -1257,7 +1275,7 @@ See also @{tf.tuple}.
 ## Infeed
 
 See also
-[`XlaBuilder::Infeed`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Infeed`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 <b> `Infeed(shape)` </b>
 
@@ -1293,17 +1311,30 @@ Infeed of the device.
 > which case the compiler will provide information about how the Infeed
 > operations are serialized in the compiled program.
 
+## Iota
+
+<b> `Iota()` </b>
+
+Builds a constant literal on device rather than a potentially large host
+transfer.  Creates a rank 1 tensor of values starting at zero and incrementing
+by one.
+
+Arguments          | Type            | Semantics
+------------------ | --------------- | ---------------------------
+`type`             | `PrimitiveType` | type U
+`size`             | `int64`         | The number of elements in the tensor.
+
 ## Map
 
 See also
-[`XlaBuilder::Map`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Map`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 <b> `Map(operands..., computation)` </b>
 
 | Arguments         | Type                   | Semantics                      |
 | ----------------- | ---------------------- | ------------------------------ |
 | `operands`        | sequence of N `XlaOp`s | N arrays of types T_0..T_{N-1} |
-| `computation`     | `XlaComputation`        | computation of type `T_0, T_1, |
+| `computation`     | `XlaComputation`       | computation of type `T_0, T_1, |
 :                   :                        : ..., T_{N + M -1} -> S` with N :
 :                   :                        : parameters of type T and M of  :
 :                   :                        : arbitrary type                 :
@@ -1325,7 +1356,7 @@ input arrays to produce the output array.
 ## Pad
 
 See also
-[`XlaBuilder::Pad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Pad`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 <b> `Pad(operand, padding_value, padding_config)` </b>
 
@@ -1364,7 +1395,7 @@ are all 0. The figure below shows examples of different `edge_padding` and
 ## Recv
 
 See also
-[`XlaBuilder::Recv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Recv`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 <b> `Recv(shape, channel_handle)` </b>
 
@@ -1398,7 +1429,7 @@ complete and returns the received data.
 ## Reduce
 
 See also
-[`XlaBuilder::Reduce`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Reduce`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Applies a reduction function to an array.
 
@@ -1515,7 +1546,7 @@ Reducing the 3D array over all its dimensions produces the scalar `84`.
 ## ReducePrecision
 
 See also
-[`XlaBuilder::ReducePrecision`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::ReducePrecision`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Models the effect of converting floating-point values to a lower-precision
 format (such as IEEE-FP16) and back to the original format.  The number of
@@ -1546,7 +1577,7 @@ portion of the conversion is then simply a no-op.
 ## ReduceWindow
 
 See also
-[`XlaBuilder::ReduceWindow`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::ReduceWindow`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Applies a reduction function to all elements in each window of the input
 multi-dimensional array, producing an output multi-dimensional array with the
@@ -1629,7 +1660,7 @@ context of [`Reduce`](#reduce) for more details.
 ## Reshape
 
 See also
-[`XlaBuilder::Reshape`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h)
+[`XlaBuilder::Reshape`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h)
 and the [`Collapse`](#collapse) operation.
 
 Reshapes the dimensions of an array into a new configuration.
@@ -1710,7 +1741,7 @@ Reshape(5, {}, {1,1}) == f32[1x1] {{5}};
 ## Rev (reverse)
 
 See also
-[`XlaBuilder::Rev`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Rev`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 <b>`Rev(operand, dimensions)`</b>
 
@@ -1732,7 +1763,7 @@ the two window dimensions during the gradient computation in neural networks.
 ## RngNormal
 
 See also
-[`XlaBuilder::RngNormal`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::RngNormal`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Constructs an output of a given shape with random numbers generated following
 the $$N(\mu, \sigma)$$ normal distribution. The parameters `mu` and `sigma`, and
@@ -1752,7 +1783,7 @@ be scalar valued.
 ## RngUniform
 
 See also
-[`XlaBuilder::RngUniform`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::RngUniform`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Constructs an output of a given shape with random numbers generated following
 the uniform distribution over the interval $$[a,b)$$. The parameters and output
@@ -1773,7 +1804,7 @@ is implementation-defined.
 ## Select
 
 See also
-[`XlaBuilder::Select`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Select`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Constructs an output array from elements of two input arrays, based on the
 values of a predicate array.
@@ -1824,7 +1855,7 @@ the same shape!) then `pred` has to be a scalar of type `PRED`.
 ## SelectAndScatter
 
 See also
-[`XlaBuilder::SelectAndScatter`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::SelectAndScatter`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 This operation can be considered as a composite operation that first computes
 `ReduceWindow` on the `operand` array to select an element from each window, and
@@ -1904,7 +1935,7 @@ context of [`Reduce`](#reduce) for more details.
 ## Send
 
 See also
-[`XlaBuilder::Send`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Send`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 <b> `Send(operand, channel_handle)` </b>
 
@@ -1959,7 +1990,7 @@ computations. For example, below schedules lead to deadlocks.
 ## Slice
 
 See also
-[`XlaBuilder::Slice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Slice`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 Slicing extracts a sub-array from the input array. The sub-array is of the same
 rank as the input and contains the values inside a bounding box within the input
@@ -2008,7 +2039,7 @@ Slice(b, {2, 1}, {4, 3}) produces:
 ## Sort
 
 See also
-[`XlaBuilder::Sort`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Sort`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 There are two versions of the Sort instruction: a single-operand and a
 two-operand version.
@@ -2068,7 +2099,7 @@ This is the same as Reshape(operand, permutation,
 ## Tuple
 
 See also
-[`XlaBuilder::Tuple`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::Tuple`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 A tuple containing a variable number of data handles, each of which has its own
 shape.
@@ -2087,7 +2118,7 @@ Tuples can be deconstructed (accessed) via the [`GetTupleElement`]
 ## While
 
 See also
-[`XlaBuilder::While`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_client/xla_builder.h).
+[`XlaBuilder::While`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
 
 <b> `While(condition, body, init)` </b>
 
diff --git a/tensorflow/docs_src/tutorials/_index.yaml b/tensorflow/docs_src/tutorials/_index.yaml
index c74fe580894a18202d427dbaa83d97981b105e37..953411468978846724b52bae73537e80694a78ee 100644
--- a/tensorflow/docs_src/tutorials/_index.yaml
+++ b/tensorflow/docs_src/tutorials/_index.yaml
@@ -2,6 +2,7 @@ project_path: /_project.yaml
 book_path: /_book.yaml
 description: <!--no description-->
 landing_page:
+  custom_css_path: /site-assets/css/style.css
   show_side_navs: True
   rows:
   - description: >
@@ -14,57 +15,6 @@ landing_page:
       </p>
     items:
     - custom_html: >
-        <style>
-        .tfo-button-primary {
-          background-color: #fca851;
-        }
-        .tfo-button-primary:hover {
-          background-color: #ef6c02;
-        }
-
-        a.colab-button {
-          display: inline-block;
-          background: rgba(255, 255, 255, 0.75);
-          padding: 4px 8px;
-          border-radius: 4px;
-          font-size: 11px!important;
-          text-decoration: none;
-          color:#aaa;border: none;
-          font-weight: 300;
-          border: solid 1px rgba(0, 0, 0, 0.08);
-          border-bottom-color: rgba(0, 0, 0, 0.15);
-          text-transform: uppercase;
-          line-height: 16px
-        }
-        a.colab-button:hover {
-          color: #666;
-          background: white;
-          border-color: rgba(0, 0, 0, 0.2);
-        }
-        a.colab-button span {
-          background-image: url("/images/colab_logo_button.svg");
-          background-repeat:no-repeat;background-size:20px;
-          background-position-y:2px;display:inline-block;
-          padding-left:24px;border-radius:4px;
-          text-decoration:none;
-        }
-
-        /* adjust code block for smaller screens */
-        @media screen and (max-width: 1000px) {
-          .tfo-landing-row-item-code-block {
-            flex-direction: column !important;
-          }
-          .tfo-landing-row-item-code-block > .devsite-landing-row-item-code {
-            /*display: none;*/
-            width: 100%;
-          }
-        }
-        @media screen and (max-width: 720px) {
-          .tfo-landing-row-item-code-block {
-            display: none;
-          }
-        }
-        </style>
         <div class="devsite-landing-row-item-description">
           <h3 class="hide-from-toc">Learn and use ML</h3>
           <div class="devsite-landing-row-item-description-content">
diff --git a/tensorflow/examples/tutorials/mnist/mnist_deep.py b/tensorflow/examples/tutorials/mnist/mnist_deep.py
index 1e0294db27bc675870afceca77a2cdcd4b3f5ad3..5d8d8d84fe26c0a3ec69791885f3c7ce5e0fba15 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_deep.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_deep.py
@@ -34,6 +34,8 @@ from tensorflow.examples.tutorials.mnist import input_data
 
 import tensorflow as tf
 
+import numpy
+
 FLAGS = None
 
 
@@ -164,8 +166,15 @@ def main(_):
         print('step %d, training accuracy %g' % (i, train_accuracy))
       train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
 
-    print('test accuracy %g' % accuracy.eval(feed_dict={
-        x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
+    # compute in batches to avoid OOM on GPUs 
+    accuracy_l = []
+    for _ in range(20):
+      batch = mnist.test.next_batch(500, shuffle=False)
+      accuracy_l.append(accuracy.eval(feed_dict={x: batch[0], 
+                                                 y_: batch[1], 
+                                                 keep_prob: 1.0}))
+    print('test accuracy %g' % numpy.mean(accuracy_l))
+
 
 if __name__ == '__main__':
   parser = argparse.ArgumentParser()
diff --git a/tensorflow/go/README.md b/tensorflow/go/README.md
index e251356ec8e97311affaf752c0a515be97013fa8..288a32530a7ed2f4d773912591907395c82db34e 100644
--- a/tensorflow/go/README.md
+++ b/tensorflow/go/README.md
@@ -46,7 +46,7 @@ from source.
     ```sh
     cd ${GOPATH}/src/github.com/tensorflow/tensorflow
     ./configure
-    bazel build --config opt //tensorflow:libtensorflow.so
+    bazel build -c opt //tensorflow:libtensorflow.so
     ```
 
     This can take a while (tens of minutes, more if also building for GPU).
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 18d74253235bea076a3a0d261af42b1080ad8d05..6c9bf1e71480e15c0e316e3e5fc1f8a1ae3a83a2 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -5225,12 +5225,26 @@ func IsBoostedTreesEnsembleInitialized(scope *Scope, tree_ensemble_handle tf.Out
 	return op.Output(0)
 }
 
+// CastAttr is an optional argument to Cast.
+type CastAttr func(optionalAttr)
+
+// CastTruncate sets the optional Truncate attribute to value.
+// If not specified, defaults to false
+func CastTruncate(value bool) CastAttr {
+	return func(m optionalAttr) {
+		m["Truncate"] = value
+	}
+}
+
 // Cast x of type SrcT to y of DstT.
-func Cast(scope *Scope, x tf.Output, DstT tf.DataType) (y tf.Output) {
+func Cast(scope *Scope, x tf.Output, DstT tf.DataType, optional ...CastAttr) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"DstT": DstT}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "Cast",
 		Input: []tf.Input{
diff --git a/tensorflow/java/maven/hadoop/pom.xml b/tensorflow/java/maven/hadoop/pom.xml
index 7391dfb965c7af4fbd54a6fad69dde435a40812d..2c2c4106cb9ada6a4f3b8217792f8ad6fd248871 100644
--- a/tensorflow/java/maven/hadoop/pom.xml
+++ b/tensorflow/java/maven/hadoop/pom.xml
@@ -5,7 +5,7 @@
     <groupId>org.tensorflow</groupId>
     <artifactId>hadoop</artifactId>
     <packaging>jar</packaging>
-    <version>1.9.0</version>
+    <version>1.10.0-rc0</version>
     <name>tensorflow-hadoop</name>
     <url>https://www.tensorflow.org</url>
     <description>TensorFlow TFRecord InputFormat/OutputFormat for Apache Hadoop</description>
diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index d44bdf8f81f83c0d61a1a499c83481f2a74d0998..5d4e04ecd3b884dccbcad0bd7d89dc4b5dece593 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.9.0</version>
+    <version>1.10.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index e8925c6fb18c19ca65cff8a4b02239d0e2edd915..e107904f7da861f3c499cec555bebd7671344dd9 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.9.0</version>
+    <version>1.10.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index 3bf4a2590cbe365bb3fd6972e00ddc5189758f01..b3c525233f50f86244ae629b8e58b0eb9d7fe465 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.9.0</version>
+    <version>1.10.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index b96dcf2888e4f4e587b27d246099d7d93b416645..a2943a317239da50e27c28514e587c2eb3e9dd3f 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.9.0</version>
+  <version>1.10.0-rc0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 5581d864d70e63d7121f6db912bae85f89cfdc9a..7080d81b7d2ee1969f01282a97d6000eb0f8e7b5 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.9.0</version>
+    <version>1.10.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
diff --git a/tensorflow/java/maven/spark-connector/pom.xml b/tensorflow/java/maven/spark-connector/pom.xml
index 64956be02c18464b10bfee5fd8c3bc83007c30c9..003d09a0b718874a320cdf9157ad69d0a095332b 100644
--- a/tensorflow/java/maven/spark-connector/pom.xml
+++ b/tensorflow/java/maven/spark-connector/pom.xml
@@ -6,7 +6,7 @@
     <groupId>org.tensorflow</groupId>
     <artifactId>spark-connector_2.11</artifactId>
     <packaging>jar</packaging>
-    <version>1.9.0</version>
+    <version>1.10.0-rc0</version>
     <name>spark-tensorflow-connector</name>
     <url>https://www.tensorflow.org</url>
     <description>TensorFlow TFRecord connector for Apache Spark DataFrames</description>
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index 92e15aa2c7082f03fb542eba8b2bf222174e8ea2..b9affbf6997d5526d4691c0244cf1c751ea7d1a7 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.9.0</version>
+    <version>1.10.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index c33a579ad28684b51d80ec864b7dbecde1f97b6e..d35731d3cd49611b10dde881fa126f0f6fd674e2 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -96,6 +96,7 @@ py_library(
         ":image_ops",
         ":initializers_ns",
         ":io_ops",
+        ":kernels",
         ":layers",
         ":lib",
         ":list_ops",
@@ -745,8 +746,8 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":framework",
+        ":framework_ops",
         ":function",
-        ":op_def_registry",
         ":tensor_shape",
         ":versions",
         "//tensorflow/core:protos_all_py",
@@ -762,8 +763,10 @@ py_test(
     deps = [
         ":array_ops",
         ":client_testlib",
+        ":constant_op",
         ":dtypes",
         ":framework_ops",
+        ":function",
         ":function_def_to_graph",
         ":graph_to_function_def",
         ":math_ops",
@@ -787,6 +790,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "kernels",
+    srcs = [
+        "framework/kernels.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":pywrap_tensorflow",
+        ":util",
+        "//tensorflow/core:protos_all_py",
+    ],
+)
+
 py_library(
     name = "op_def_library",
     srcs = ["framework/op_def_library.py"],
@@ -1480,6 +1496,20 @@ py_test(
     ],
 )
 
+py_test(
+    name = "framework_kernels_test",
+    size = "small",
+    srcs = ["framework/kernels_test.py"],
+    main = "framework/kernels_test.py",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":framework_test_lib",
+        ":kernels",
+        ":platform_test",
+        ":test_ops",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "array_ops_gen",
     visibility = [
@@ -3377,6 +3407,7 @@ py_test(
         ":math_ops",
         ":util",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -3627,6 +3658,7 @@ tf_cuda_library(
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:session_ref",
         "//third_party/py/numpy:headers",
         "//third_party/python_runtime:headers",
     ],
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index e037925961f2bfc8b8906fa81c2d7908ea590a62..f8e20e1b8934782c4d65bd75ec8ab53c86723851 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import functools
 import re
 import threading
@@ -243,7 +244,7 @@ class _FetchMapper(object):
     elif isinstance(fetch, (list, tuple)):
       # NOTE(touts): This is also the code path for namedtuples.
       return _ListFetchMapper(fetch)
-    elif isinstance(fetch, dict):
+    elif isinstance(fetch, collections.Mapping):
       return _DictFetchMapper(fetch)
     else:
       # Look for a handler in the registered expansions.
@@ -540,10 +541,11 @@ class _DeviceAttributes(object):
         (in bytes).
   """
 
-  def __init__(self, name, device_type, memory_limit_bytes):
+  def __init__(self, name, device_type, memory_limit_bytes, incarnation):
     self._name = device.canonical_name(name)
     self._device_type = device_type
     self._memory_limit_bytes = memory_limit_bytes
+    self._incarnation = incarnation
 
   @property
   def name(self):
@@ -557,11 +559,16 @@ class _DeviceAttributes(object):
   def memory_limit_bytes(self):
     return self._memory_limit_bytes
 
+  @property
+  def incarnation(self):
+    return self._incarnation
+
   def __repr__(self):
-    return '_DeviceAttributes(%s, %s, %d)' % (
+    return '_DeviceAttributes(%s, %s, %d, %d)' % (
         self.name,
         self.device_type,
         self.memory_limit_bytes,
+        self.incarnation,
     )
 
 
@@ -623,7 +630,7 @@ class BaseSession(SessionInterface):
     opts = tf_session.TF_NewSessionOptions(target=self._target, config=config)
     try:
       # pylint: disable=protected-access
-      self._session = tf_session.TF_NewSession(self._graph._c_graph, opts)
+      self._session = tf_session.TF_NewSessionRef(self._graph._c_graph, opts)
       # pylint: enable=protected-access
     finally:
       tf_session.TF_DeleteSessionOptions(opts)
@@ -658,7 +665,9 @@ class BaseSession(SessionInterface):
       name = tf_session.TF_DeviceListName(raw_device_list, i)
       device_type = tf_session.TF_DeviceListType(raw_device_list, i)
       memory = tf_session.TF_DeviceListMemoryBytes(raw_device_list, i)
-      device_list.append(_DeviceAttributes(name, device_type, memory))
+      incarnation = tf_session.TF_DeviceListIncarnation(raw_device_list, i)
+      device_list.append(
+          _DeviceAttributes(name, device_type, memory, incarnation))
     tf_session.TF_DeleteDeviceList(raw_device_list)
     return device_list
 
@@ -1226,8 +1235,12 @@ class BaseSession(SessionInterface):
 
       return _fetch_handler_run
 
-  # Captures the name of a node in an error status.
-  _NODEDEF_NAME_RE = re.compile(r'\[\[Node: ([^ ]*?) =')
+  # Captures the name of a node in an error status. The regex below matches
+  # both the old and the new formats:
+  # Old format: [[Node: <node_name> = ...]]
+  # New format: [[{{node <node_name>}} = ...]]
+  _NODEDEF_NAME_RE = re.compile(
+      r'\[\[(Node: )?(\{\{node )?([^\} ]*)(\}\})?\s*=')
 
   def _do_run(self, handle, target_list, fetch_list, feed_dict, options,
               run_metadata):
@@ -1282,7 +1295,7 @@ class BaseSession(SessionInterface):
       node_def = None
       op = None
       if m is not None:
-        node_name = m.group(1)
+        node_name = m.group(3)
         try:
           op = self._graph.get_operation_by_name(node_name)
           node_def = op.node_def
diff --git a/tensorflow/python/client/session_list_devices_test.py b/tensorflow/python/client/session_list_devices_test.py
index c5d82c213ac890ac4c968eba506695c3a2ce93c4..dd381c689fde31531668d83441b5ee92bd1ab9ec 100644
--- a/tensorflow/python/client/session_list_devices_test.py
+++ b/tensorflow/python/client/session_list_devices_test.py
@@ -37,6 +37,8 @@ class SessionListDevicesTest(test_util.TensorFlowTestCase):
       devices = sess.list_devices()
       self.assertTrue('/job:localhost/replica:0/task:0/device:CPU:0' in set(
           [d.name for d in devices]), devices)
+      # All valid device incarnations must be non-zero.
+      self.assertTrue(all(d.incarnation != 0 for d in devices))
 
   def testInvalidDeviceNumber(self):
     opts = tf_session.TF_NewSessionOptions()
@@ -54,6 +56,8 @@ class SessionListDevicesTest(test_util.TensorFlowTestCase):
       devices = sess.list_devices()
       self.assertTrue('/job:local/replica:0/task:0/device:CPU:0' in set(
           [d.name for d in devices]), devices)
+      # All valid device incarnations must be non-zero.
+      self.assertTrue(all(d.incarnation != 0 for d in devices))
 
   def testListDevicesClusterSpecPropagation(self):
     server1 = server_lib.Server.create_local_server()
@@ -67,11 +71,13 @@ class SessionListDevicesTest(test_util.TensorFlowTestCase):
     config = config_pb2.ConfigProto(cluster_def=cluster_def)
     with session.Session(server1.target, config=config) as sess:
       devices = sess.list_devices()
-      device_names = set([d.name for d in devices])
+      device_names = set(d.name for d in devices)
       self.assertTrue(
           '/job:worker/replica:0/task:0/device:CPU:0' in device_names)
       self.assertTrue(
           '/job:worker/replica:0/task:1/device:CPU:0' in device_names)
+      # All valid device incarnations must be non-zero.
+      self.assertTrue(all(d.incarnation != 0 for d in devices))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index b72e029d1ccb688f5992f6cc8695969be5e5e2e3..052be683856beb41ab572e808c260817b05ef5ae 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -35,6 +35,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import device as framework_device_lib
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import function
@@ -104,18 +105,20 @@ class SessionTest(test_util.TensorFlowTestCase):
           copy_val)
 
   def testManyCPUs(self):
-    # TODO(keveman): Implement ListDevices and test for the number of
-    # devices returned by ListDevices.
     with session.Session(
         config=config_pb2.ConfigProto(device_count={
-            'CPU': 2
-        })):
+            'CPU': 2, 'GPU': 0
+        })) as sess:
       inp = constant_op.constant(10.0, name='W1')
       self.assertAllEqual(inp.eval(), 10.0)
 
+      devices = sess.list_devices()
+      self.assertEqual(2, len(devices))
+      for device in devices:
+        self.assertEqual('CPU', framework_device_lib.DeviceSpec.from_string(
+            device.name).device_type)
+
   def testPerSessionThreads(self):
-    # TODO(keveman): Implement ListDevices and test for the number of
-    # devices returned by ListDevices.
     with session.Session(
         config=config_pb2.ConfigProto(use_per_session_threads=True)):
       inp = constant_op.constant(10.0, name='W1')
@@ -1868,19 +1871,21 @@ class SessionTest(test_util.TensorFlowTestCase):
 
   def testDeviceAttributes(self):
     attrs = session._DeviceAttributes(
-        '/job:worker/replica:0/task:3/device:CPU:2', 'TYPE', 1337)
+        '/job:worker/replica:0/task:3/device:CPU:2', 'TYPE', 1337, 1000000)
     self.assertEqual(1337, attrs.memory_limit_bytes)
     self.assertEqual('/job:worker/replica:0/task:3/device:CPU:2', attrs.name)
     self.assertEqual('TYPE', attrs.device_type)
+    self.assertEqual(1000000, attrs.incarnation)
     str_repr = '%s' % attrs
     self.assertTrue(str_repr.startswith('_DeviceAttributes'), str_repr)
 
   def testDeviceAttributesCanonicalization(self):
     attrs = session._DeviceAttributes('/job:worker/replica:0/task:3/cpu:1',
-                                      'TYPE', 1337)
+                                      'TYPE', 1337, 1000000)
     self.assertEqual(1337, attrs.memory_limit_bytes)
     self.assertEqual('/job:worker/replica:0/task:3/device:CPU:1', attrs.name)
     self.assertEqual('TYPE', attrs.device_type)
+    self.assertEqual(1000000, attrs.incarnation)
     str_repr = '%s' % attrs
     self.assertTrue(str_repr.startswith('_DeviceAttributes'), str_repr)
 
diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i
index 985cb904360ac293461936bf67fb1b1de2c77b4a..39a2922ac0e54367f454c36921a029a9a7d7e82e 100644
--- a/tensorflow/python/client/tf_session.i
+++ b/tensorflow/python/client/tf_session.i
@@ -138,6 +138,11 @@ tensorflow::ImportNumpy();
   $result = PyLong_FromLongLong($1);
 }
 
+// Convert TF_DeviceListIncarnation uint64_t output to Python integer
+%typemap(out) uint64_t {
+  $result = PyLong_FromUnsignedLongLong($1);
+}
+
 // We use TF_OperationGetControlInputs_wrapper instead of
 // TF_OperationGetControlInputs
 %ignore TF_OperationGetControlInputs;
@@ -772,6 +777,7 @@ def TF_Reset(target, containers=None, config=None):
   $1 = &types_local;
 }
 
+%unignore TF_NewSessionRef;
 %unignore SetRequireShapeInferenceFns;
 %unignore TF_TryEvaluateConstant_wrapper;
 %noexception TF_TryEvaluateConstant_wrapper;
diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc
index b6481e7e29e4057f08e1c78b310bf5581afc5411..bcd4af291282bbefda3db0309bb9f0a913f186ce 100644
--- a/tensorflow/python/client/tf_session_helper.cc
+++ b/tensorflow/python/client/tf_session_helper.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/tf_status_helper.h"
+#include "tensorflow/core/common_runtime/session_ref.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
@@ -42,6 +43,19 @@ static const char* kFeedDictErrorMsg =
     "feed_dict must be a dictionary mapping strings to NumPy arrays.";
 }  // end namespace
 
+TF_Session* TF_NewSessionRef(TF_Graph* graph, const TF_SessionOptions* opts,
+                             TF_Status* status) {
+  TF_Session* tf_session = TF_NewSession(graph, opts, status);
+  if (tf_session == nullptr) {
+    return nullptr;
+  }
+
+  Session* session = reinterpret_cast<Session*>(tf_session->session);
+  SessionRef* session_ref = new SessionRef(session);
+  tf_session->session = session_ref;
+  return tf_session;
+}
+
 void TF_Run_wrapper_helper(TF_DeprecatedSession* session, const char* handle,
                            const TF_Buffer* run_options, PyObject* feed_dict,
                            const NameVector& output_names,
diff --git a/tensorflow/python/client/tf_session_helper.h b/tensorflow/python/client/tf_session_helper.h
index cfd27c2bee990ab4e2829652a532761e674ed8e0..dab7e71aac5a7f4cbf9f8825ad6dd5d3f556bd43 100644
--- a/tensorflow/python/client/tf_session_helper.h
+++ b/tensorflow/python/client/tf_session_helper.h
@@ -40,6 +40,9 @@ typedef tensorflow::gtl::InlinedVector<PyObject*, 8> PyObjectVector;
 // A TF_TensorVector is a vector of borrowed pointers to TF_Tensors.
 typedef gtl::InlinedVector<TF_Tensor*, 8> TF_TensorVector;
 
+TF_Session* TF_NewSessionRef(TF_Graph* graph, const TF_SessionOptions* opts,
+                             TF_Status* status);
+
 // Run the graph associated with the session starting with the
 // supplied inputs[].  Regardless of success or failure, inputs[] are
 // stolen by the implementation (i.e. the implementation will
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index 35de2f2841604d95fa3363b0b8e194ec1723f554..494df178dfa48eaa6c864cb1c3d89f9a0cb9af43 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -57,6 +57,13 @@ GET_NEXT_CALL_WARNING_MESSAGE = (
 GLOBAL_ITERATORS = "iterators"
 
 
+def _device_stack_is_empty():
+  # pylint: disable=protected-access
+  device_stack = ops.get_default_graph()._device_functions_outer_to_inner
+  # pylint: enable=protected-access
+  return not bool(device_stack)
+
+
 @tf_export("data.Iterator")
 class Iterator(object):
   """Represents the state of iterating through a `Dataset`."""
@@ -174,7 +181,7 @@ class Iterator(object):
     if shared_name is None:
       shared_name = ""
     if compat.forward_compatible(2018, 8, 3):
-      if not ops.get_default_graph()._graph_device_function_stack:  # pylint: disable=protected-access
+      if _device_stack_is_empty():
         with ops.device("/cpu:0"):
           iterator_resource = gen_dataset_ops.iterator_v2(
               container="",
@@ -263,7 +270,7 @@ class Iterator(object):
     nest.assert_same_structure(output_types, output_shapes)
     string_handle = ops.convert_to_tensor(string_handle, dtype=dtypes.string)
     if compat.forward_compatible(2018, 8, 3):
-      if not ops.get_default_graph()._graph_device_function_stack:  # pylint: disable=protected-access
+      if _device_stack_is_empty():
         with ops.device("/cpu:0"):
           iterator_resource = gen_dataset_ops.iterator_from_string_handle_v2(
               string_handle,
@@ -499,7 +506,8 @@ class EagerIterator(object):
           "tf.data.Dataset.make_initializable_iterator or "
           "tf.data.Dataset.make_one_shot_iterator for graph construction".
           format(type(self)))
-    with ops.device("/device:CPU:0"):
+    self._device = context.context().device_name
+    with ops.device("/cpu:0"):
       ds_variant = dataset._as_variant_tensor()  # pylint: disable=protected-access
       self._output_classes = dataset.output_classes
       self._output_types = dataset.output_types
@@ -508,14 +516,14 @@ class EagerIterator(object):
           sparse.as_dense_types(self._output_types, self._output_classes))
       self._flat_output_shapes = nest.flatten(
           sparse.as_dense_shapes(self._output_shapes, self._output_classes))
-      self._resource = gen_dataset_ops.anonymous_iterator(
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
-      gen_dataset_ops.make_iterator(ds_variant, self._resource)
-      # Delete the resource when this object is deleted
-      self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
-          handle=self._resource, handle_device="/device:CPU:0")
-    self._device = context.context().device_name
+      with ops.colocate_with(ds_variant):
+        self._resource = gen_dataset_ops.anonymous_iterator(
+            output_types=self._flat_output_types,
+            output_shapes=self._flat_output_shapes)
+        gen_dataset_ops.make_iterator(ds_variant, self._resource)
+        # Delete the resource when this object is deleted
+        self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
+            handle=self._resource, handle_device=self._device)
 
   def __iter__(self):
     return self
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..2bd0b4320afb500afad30e3d5cb0000711e1b664
--- /dev/null
+++ b/tensorflow/python/distribute/BUILD
@@ -0,0 +1,43 @@
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+
+py_library(
+    name = "distribute_coordinator",
+    srcs = [
+        "distribute_coordinator.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:training",
+    ],
+)
+
+py_test(
+    name = "distribute_coordinator_test",
+    size = "small",
+    srcs = ["distribute_coordinator_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":distribute_coordinator",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:distributed_framework_test_lib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:training",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+    ],
+)
diff --git a/tensorflow/python/distribute/distribute_coordinator.py b/tensorflow/python/distribute/distribute_coordinator.py
new file mode 100644
index 0000000000000000000000000000000000000000..04c50dbafc0c1497a86c0e4b7e64661a21af51b4
--- /dev/null
+++ b/tensorflow/python/distribute/distribute_coordinator.py
@@ -0,0 +1,361 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A unified and split coordinator for distributed TensorFlow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import json
+import os
+import threading
+
+from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.python.training import server_lib
+
+
+class _TaskType(object):
+  PS = "ps"
+  WORKER = "worker"
+  CHIEF = "chief"
+  EVALUATOR = "evaluator"
+
+
+_coordinator_context = threading.local()
+
+
+def get_current_coordinator_context():
+  """Returns the current coordinator context."""
+  try:
+    return _coordinator_context.current
+  except AttributeError:
+    return None
+
+
+class _Barrier(object):
+  """A reusable barrier class for worker synchronization."""
+
+  def __init__(self, num_participants):
+    """Initializes the barrier object.
+
+    Args:
+      num_participants: an integer which is the expected number of calls of
+        `wait` pass to through this barrier.
+    """
+    self._num_participants = num_participants
+    self._counter = 0
+    self._flag = False
+    self._local_sense = threading.local()
+    self._lock = threading.Lock()
+    self._condition = threading.Condition()
+
+  def wait(self):
+    """Waits until all other callers reach the same wait call."""
+    if not hasattr(self._local_sense, "value"):
+      self._local_sense.value = False
+    self._local_sense.value = not self._flag
+    with self._lock:
+      self._counter += 1
+      if self._counter == self._num_participants:
+        self._counter = 0
+        self._flag = self._local_sense.value
+    with self._condition:
+      while self._flag != self._local_sense.value:
+        self._condition.wait()
+      self._condition.notify_all()
+
+
+def _get_num_workers(cluster_spec):
+  """Gets number of workers including chief."""
+  if not cluster_spec:
+    return 0
+  return len(cluster_spec.as_dict().get(_TaskType.WORKER, [])) + len(
+      cluster_spec.as_dict().get(_TaskType.CHIEF, []))
+
+
+class _CoordinatorContext(object):
+  """The coordinator context class.
+
+  This context object provides configuration information for each task. One
+  context manager with a coordinator context object will be created per
+  invocation to the `worker_fn` where `get_current_coordinator_context` can be
+  called to access the coordinator context object.
+  """
+
+  def __init__(self,
+               cluster_spec,
+               task_type,
+               task_id,
+               between_graph=False,
+               rpc_layer="grpc",
+               worker_barrier=None):
+    """Initialize the coordinator context object.
+
+    Args:
+      cluster_spec: a ClusterSpec object. It can be empty or None in the local
+        training case.
+      task_type: a string indicating the role of the corresponding task, such as
+        "worker" or "ps". It can be None if it is local training or
+        `between_graph` is False.
+      task_id: an integer indicating id of the corresponding task. It can be
+        None if it is local training or `between_graph` is False.
+      between_graph: whether it is between-graph replication or not.
+      rpc_layer: optional string specifying the RPC protocol for communication
+        with worker masters. If None or empty, hosts in the `cluster_spec` will
+        be used directly.
+      worker_barrier: optional, the barrier object for worker synchronization.
+
+    Raises:
+      ValueError: if task_type or task_id is Node or empty and it is distributed
+        between-graph replicated training.
+    """
+    if cluster_spec and between_graph:
+      if not task_type or task_id is None:
+        raise ValueError("`task_type` and `task_id` must be set in the "
+                         "distributed between-graph replicated training.")
+      if task_type not in cluster_spec.jobs:
+        raise ValueError("`task_type` %r not found in the `cluster_spec` %r" %
+                         (task_type, cluster_spec))
+    self._cluster_spec = cluster_spec
+    self._task_type = task_type
+    self._task_id = task_id
+    self._worker_barrier = worker_barrier
+    self._rpc_layer = rpc_layer
+    self._master_target = self._get_master_target()
+    self._num_workers = _get_num_workers(cluster_spec)
+    self._is_chief_node = self._is_chief()
+
+  def __enter__(self):
+    old_context = get_current_coordinator_context()
+    if old_context:
+      raise ValueError(
+          "You cannot run distribute coordinator in a `worker_fn`.")
+    _coordinator_context.current = self
+
+  def __exit__(self, unused_exception_type, unused_exception_value,
+               unused_traceback):
+    _coordinator_context.current = None
+
+  def _get_master_target(self):
+    """Return the master target for a task."""
+    # If cluster_spec is None or empty, we use local master.
+    if not self._cluster_spec:
+      return "local"
+
+    # If task_type is None, then it is in-graph replicated training. In this
+    # case we use the chief or first worker's master target.
+    if not self._task_type:
+      if _TaskType.CHIEF in self._cluster_spec.jobs:
+        assert not self.between_graph
+        task_type = _TaskType.CHIEF
+        task_id = 0
+      else:
+        assert _TaskType.WORKER in self._cluster_spec.jobs
+        task_type = _TaskType.WORKER
+        task_id = 0
+    else:
+      task_type = self._task_type
+      task_id = self._task_id
+
+    prefix = ""
+    if self._rpc_layer:
+      prefix = self._rpc_layer + "://"
+    return prefix + self._cluster_spec.job_tasks(task_type)[task_id or 0]
+
+  def _is_chief(self):
+    """Return whether the task is the chief worker."""
+    if (not self._cluster_spec or self._task_type in [_TaskType.CHIEF, None]):
+      return True
+
+    # If not local and chief not in the cluster_spec, use the first worker as
+    # chief.
+    if (_TaskType.CHIEF not in self._cluster_spec.jobs and
+        self._task_type == _TaskType.WORKER and self._task_id == 0):
+      return True
+    return False
+
+  def wait_for_other_workers(self):
+    """Waits for other workers to reach the same call to this method.
+
+    Raises:
+      ValueError: if `worker_barrier` is not passed to the __init__ method.
+    """
+    if not self._worker_barrier:
+      raise ValueError(
+          "`worker_barrier is not set in the coordinator context.`")
+    self._worker_barrier.wait()
+
+  @property
+  def distributed_mode(self):
+    """Whether it is distributed training or not."""
+    return bool(self._cluster_spec)
+
+  @property
+  def cluster_spec(self):
+    """Returns a copy of the cluster_spec object."""
+    return copy.deepcopy(self._cluster_spec)
+
+  @property
+  def task_type(self):
+    """Returns the role of the corresponing task."""
+    return self._task_type
+
+  @property
+  def task_id(self):
+    """Returns the id or index of the corresponing task."""
+    return self._task_id
+
+  @property
+  def master_target(self):
+    """Returns the session master for the corresponding task to connect to."""
+    return self._master_target
+
+  @property
+  def is_chief(self):
+    """Returns whether the task is a chief node."""
+    return self._is_chief_node
+
+  @property
+  def num_workers(self):
+    """Returns number of workers in the cluster, including chief."""
+    return self._num_workers
+
+
+def _run(worker_fn, cluster_spec, task_type, task_id, between_graph, rpc_layer,
+         worker_barrier):
+  with _CoordinatorContext(cluster_spec, task_type, task_id, between_graph,
+                           rpc_layer, worker_barrier):
+    worker_fn()
+
+
+def run_distribute_coordinator(worker_fn,
+                               cluster_spec=None,
+                               between_graph=False,
+                               rpc_layer=None):
+  """Run the coordinator for distributed TensorFlow.
+
+  This function runs a unified and split coordinator for distributed TensorFlow.
+  Given a `cluster_spec` specifying server addresses and their roles in a
+  cluster, this coordinator will figure out how to set them up, give the
+  underlying function the right targets for master sessions and coordinate their
+  training.
+
+  In addition to be the distribute coordinator, this is also the source of
+  configurations for each job in the distributed training. As there are multiple
+  ways to configure a distributed TensorFlow cluster, its context object
+  provides these configurations so that users or higher-level APIs don't have to
+  figure out the configuration for each job by themselves.
+
+  In the between-graph replicated training, this coordinator will create
+  multiple threads and each calls the `worker_fn` which is supposed to create
+  its own graph and connect to one worker master given by its coordinator
+  context. In the in-graph replicated training, it has only one thread calling
+  this `worker_fn`.
+
+  The `worker_fn` defines the training logic and is called under a its own
+  coordinator context which can be accessed to via
+  `get_current_coordinator_context`. A coordinator context provides access to
+  configurations for each task, e.g. the task_type, task_id, master target and
+  so on. Since `worker_fn` will be called in a thread and possibly multiple
+  times, caller should be careful when it accesses global data. For example, it
+  is unsafe to define flags in a `worker_fn` or to define different environment
+  variables for different `worker_fn`s.
+
+  The `worker_fn` for the between-graph replication is defined as if there are
+  only one worker corresponding to the `worker_fn` and possibly ps jobs. It
+  assigns variables to parameter servers and all other operations to that
+  worker. In the in-graph replication case, the `worker_fn` has to define
+  operations for all worker jobs. Using a distribution strategy can simplify the
+  `worker_fn` by not having to worry about the replication and device assignment
+  of variables and operations.
+
+  This method is intended to be invoked by high-level APIs so that users don't
+  have to explictly call it to run this coordinator. For those who don't use
+  high-level APIs, to change a program to use this coordinator, wrap everything
+  in a the program after global data definitions such as commandline flag
+  definition into the `worker_fn` and get task-specific configurations from
+  the coordinator context.
+
+  The `cluster_spec` can be either passed by the argument or parsed from the
+  "TF_CONFIG" envrionment variable. Example of a TF_CONFIG:
+  ```
+    cluster = {'chief': ['host0:2222'],
+               'ps': ['host1:2222', 'host2:2222'],
+               'worker': ['host3:2222', 'host4:2222', 'host5:2222']}
+    os.environ['TF_CONFIG'] = json.dumps({'cluster': cluster})
+  ```
+
+  If `cluster_spec` is not given in any format, it becomes local training and
+  this coordinator will connect to a local session.
+
+  For evaluation, if "evaluator" exist in the cluster_spec, a separate thread
+  will be created with its `task_type` set to "evaluator". If "evaluator" is not
+  set in the cluster_spec, it entirely depends on the `worker_fn` for how to do
+  evaluation.
+
+  Args:
+    worker_fn: the function to be called and given the access to a coordinator
+      context object.
+    cluster_spec: a dict, ClusterDef or ClusterSpec specifying servers and roles
+      in a cluster. If not set or empty, fall back to local training.
+    between_graph: a boolean. It is only useful when `cluster_spec` is set and
+      not empty. If true, it will use between-graph replicated training;
+      otherwise it will use in-graph replicated training.
+    rpc_layer: optional string, the protocol for RPC, e.g. "grpc".
+
+  Raises:
+    ValueError: if `cluster_spec` is supplied but not a dict or a ClusterDef or
+      a ClusterSpec.
+  """
+  if not cluster_spec:
+    tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
+    cluster_spec = tf_config.get("cluster", {})
+
+  if cluster_spec:
+    if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
+      cluster_spec = server_lib.ClusterSpec(cluster_spec)
+    elif not isinstance(cluster_spec, server_lib.ClusterSpec):
+      raise ValueError(
+          "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
+          "`tf.train.ClusterDef` object")
+    # TODO(yuefengz): validate cluster_spec.
+
+  threads = []
+  if cluster_spec and _TaskType.EVALUATOR in cluster_spec.jobs:
+    t = threading.Thread(
+        target=_run,
+        args=(worker_fn, cluster_spec, _TaskType.EVALUATOR, 0, between_graph,
+              rpc_layer, None))
+    t.start()
+    threads.append(t)
+
+  if cluster_spec and between_graph:
+    worker_barrier = _Barrier(_get_num_workers(cluster_spec))
+    for task_type in [_TaskType.CHIEF, _TaskType.WORKER]:
+      for task_id in range(len(cluster_spec.as_dict().get(task_type, []))):
+        t = threading.Thread(
+            target=_run,
+            args=(worker_fn, cluster_spec, task_type, task_id, between_graph,
+                  rpc_layer, worker_barrier))
+        t.start()
+        threads.append(t)
+  else:
+    # Local or in-graph replicated training.
+    _run(worker_fn, cluster_spec, None, None, between_graph, rpc_layer, None)
+
+  # TODO(yuefengz): wrapper threads into thread coordinator?
+  for t in threads:
+    t.join()
diff --git a/tensorflow/python/distribute/distribute_coordinator_test.py b/tensorflow/python/distribute/distribute_coordinator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..82fd823352c03b941a32c8d50510a8d142b466a2
--- /dev/null
+++ b/tensorflow/python/distribute/distribute_coordinator_test.py
@@ -0,0 +1,291 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for distribute coordinator."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import copy
+import threading
+import six
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.distribute import distribute_coordinator
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+CHIEF = distribute_coordinator._TaskType.CHIEF
+WORKER = distribute_coordinator._TaskType.WORKER
+PS = distribute_coordinator._TaskType.PS
+EVALUATOR = distribute_coordinator._TaskType.EVALUATOR
+
+NUM_WORKERS = 3
+NUM_PS = 2
+
+
+def _bytes_to_str(maybe_bytes):
+  if isinstance(maybe_bytes, six.string_types):
+    return maybe_bytes
+  else:
+    return str(maybe_bytes, "utf-8")
+
+
+class DistributeCoordinatorTest(test.TestCase):
+
+  @classmethod
+  def setUpClass(cls):
+    # We have to create a global in-process cluster because once an in-process
+    # tensorflow server is created, there is no way to terminate it. Please see
+    # multi_worker_test_base.py for more details.
+    cls._workers, cls._ps = test_util.create_local_cluster(
+        NUM_WORKERS, num_ps=NUM_PS)
+    cls._cluster_spec = {
+        WORKER: [_bytes_to_str(w.target) for w in cls._workers],
+        PS: [_bytes_to_str(ps.target) for ps in cls._ps]
+    }
+
+  def setUp(self):
+    self._result_correct = 0
+    self._lock = threading.Lock()
+    self._task_context = {}
+
+  @contextlib.contextmanager
+  def _test_session(self, target):
+    config = config_pb2.ConfigProto(allow_soft_placement=True)
+    config.graph_options.optimizer_options.opt_level = -1
+    with session.Session(graph=None, config=config, target=target) as sess:
+      yield sess
+
+  def _in_graph_worker_fn(self):
+    context = distribute_coordinator.get_current_coordinator_context()
+    self.assertTrue(context is not None)
+    with self._test_session(target=context.master_target) as sess:
+      xs = []
+      expected = 0.0
+      for i in range(context.num_workers):
+        with ops.device("/job:worker/task:%d" % i):
+          x = variable_scope.get_variable("x_%d" % i, initializer=10.0)
+          x_add = x.assign_add(float(i))
+          xs.append(x_add)
+          expected += i + 10.0
+
+      with ops.device("/job:worker/task:0"):
+        result = math_ops.add_n(xs)
+
+      variables.global_variables_initializer().run()
+      result_value = sess.run(result)
+    self.assertEqual(result_value, expected)
+    if result_value == expected:
+      self._result_correct += 1
+
+  def testInGraph(self):
+    """Test it runs in-graph replicated training correctly."""
+    distribute_coordinator.run_distribute_coordinator(
+        self._in_graph_worker_fn,
+        cluster_spec=self._cluster_spec,
+        between_graph=False)
+    self.assertEqual(self._result_correct, 1)
+
+  def _between_graph_worker_fn(self):
+    context = distribute_coordinator.get_current_coordinator_context()
+    self.assertTrue(context is not None)
+    with self._test_session(target=context.master_target) as sess:
+      with ops.device("/job:ps/task:0"):
+        # TODO(yuefengz): investigate why not using resource variable will make
+        # the test flaky.
+        x = variable_scope.get_variable(
+            "x", initializer=10.0, use_resource=True)
+      with ops.device("/job:ps/task:1"):
+        y = variable_scope.get_variable(
+            "y", initializer=20.0, use_resource=True)
+
+      x_add = x.assign_add(2.0)
+      y_sub = y.assign_sub(2.0)
+      train_op = control_flow_ops.group([x_add, y_sub])
+
+      if context.is_chief:
+        variables.global_variables_initializer().run()
+
+      # Synchronize workers after initializaton.
+      context.wait_for_other_workers()
+
+      sess.run(train_op)
+
+      # Synchronize workers after one step to make sure they all have finished
+      # training.
+      context.wait_for_other_workers()
+
+      x_val, y_val = sess.run([x, y])
+
+      self.assertEqual(x_val, 16.0)
+      self.assertEqual(y_val, 14.0)
+      if x_val == 16.0 and y_val == 14.0:
+        with self._lock:
+          self._result_correct += 1
+
+  def testBetweenGraph(self):
+    """Test it runs between-graph replicated training correctly."""
+    distribute_coordinator.run_distribute_coordinator(
+        self._between_graph_worker_fn,
+        cluster_spec=self._cluster_spec,
+        between_graph=True)
+
+    # Each finished worker will increment self._result_correct.
+    self.assertEqual(self._result_correct, NUM_WORKERS)
+
+  def _dump_task_context(self):
+    """Dumps the propoerties of each coordinator context.
+
+    It dumps the context properties to a dict mapping from task_type to a list
+    of tuples of master_target, num_workers, is_chief and distribute_mode, where
+    the list is indexed by the task_id.
+    """
+    context = distribute_coordinator.get_current_coordinator_context()
+    self.assertTrue(context is not None)
+    task_type = str(context.task_type)
+    task_id = context.task_id or 0
+    with self._lock:
+      if task_type not in self._task_context:
+        self._task_context[task_type] = []
+      while len(self._task_context[task_type]) <= task_id:
+        self._task_context[task_type].append(None)
+      self._task_context[task_type][task_id] = (context.master_target,
+                                                context.num_workers,
+                                                context.is_chief,
+                                                context.distributed_mode)
+
+  def testBetweenGraphContext(self):
+    # Dumps the task contexts to the self._task_context dict.
+    distribute_coordinator.run_distribute_coordinator(
+        self._dump_task_context,
+        cluster_spec=self._cluster_spec,
+        between_graph=True)
+
+    # There is only one type of task and there three such tasks.
+    self.assertEqual(len(self._task_context), 1)
+    self.assertTrue(WORKER in self._task_context)
+    self.assertEqual(len(self._task_context[WORKER]), NUM_WORKERS)
+
+    # Check whether each task has the right master_target, num_workers, is_chief
+    # and distributed_mode.
+    self.assertEqual(
+        self._task_context[WORKER][0],
+        (_bytes_to_str(self._workers[0].target), NUM_WORKERS, True, True))
+    self.assertEqual(
+        self._task_context[WORKER][1],
+        (_bytes_to_str(self._workers[1].target), NUM_WORKERS, False, True))
+    self.assertEqual(
+        self._task_context[WORKER][2],
+        (_bytes_to_str(self._workers[2].target), NUM_WORKERS, False, True))
+
+  def testInGraphContext(self):
+    # Dumps the task contexts to the self._task_context dict.
+    distribute_coordinator.run_distribute_coordinator(
+        self._dump_task_context,
+        cluster_spec=self._cluster_spec,
+        between_graph=False)
+
+    # There is only a "None" task in the dumped task context.
+    self.assertEqual(len(self._task_context), 1)
+    self.assertTrue("None" in self._task_context)
+    self.assertEqual(len(self._task_context["None"]), 1)
+
+    # Check whether each task has the right master_target, num_workers, is_chief
+    # and distributed_mode.
+    self.assertEqual(
+        self._task_context["None"][0],
+        (_bytes_to_str(self._workers[0].target), NUM_WORKERS, True, True))
+
+  def testLocalContext(self):
+    # Dumps the task contexts to the self._task_context dict.
+    distribute_coordinator.run_distribute_coordinator(
+        self._dump_task_context, cluster_spec=None, between_graph=True)
+
+    # There is only a "None" task.
+    self.assertEqual(len(self._task_context), 1)
+    self.assertTrue("None" in self._task_context)
+    self.assertEqual(len(self._task_context["None"]), 1)
+
+    # Check whether each task has the right master_target, num_workers, is_chief
+    # and distributed_mode.
+    self.assertEqual(self._task_context["None"][0], ("local", 0, True, False))
+
+  def testBetweenGraphContextWithChief(self):
+    # Adds a chief node, so there are NUM_WORKERS + 1 workers in total.
+    cluster_spec = copy.deepcopy(self._cluster_spec)
+    cluster_spec[CHIEF] = ["fake_chief"]
+
+    # Dumps the task contexts to the self._task_context dict.
+    distribute_coordinator.run_distribute_coordinator(
+        self._dump_task_context,
+        cluster_spec=cluster_spec,
+        between_graph=True,
+        rpc_layer="grpc")
+
+    # There are one CHIEF and three workers.
+    self.assertEqual(len(self._task_context), 2)
+    self.assertTrue(CHIEF in self._task_context)
+    self.assertTrue(WORKER in self._task_context)
+    self.assertEqual(len(self._task_context[CHIEF]), 1)
+    self.assertEqual(len(self._task_context[WORKER]), NUM_WORKERS)
+
+    # Check whether each task has the right master_target, num_workers, is_chief
+    # and distributed_mode.
+    self.assertEqual(self._task_context[CHIEF][0],
+                     ("grpc://fake_chief", 4, True, True))
+    self.assertEqual(self._task_context[WORKER][0],
+                     ("grpc://" + _bytes_to_str(self._workers[0].target),
+                      NUM_WORKERS + 1, False, True))
+    self.assertEqual(self._task_context[WORKER][1],
+                     ("grpc://" + _bytes_to_str(self._workers[1].target),
+                      NUM_WORKERS + 1, False, True))
+    self.assertEqual(self._task_context[WORKER][2],
+                     ("grpc://" + _bytes_to_str(self._workers[2].target),
+                      NUM_WORKERS + 1, False, True))
+
+  def testInGraphContextWithEval(self):
+    # Adds a EVALUATOR job.
+    cluster_spec = copy.deepcopy(self._cluster_spec)
+    cluster_spec[EVALUATOR] = ["fake_evaluator"]
+
+    # Dumps the task contexts to the self._task_context dict.
+    distribute_coordinator.run_distribute_coordinator(
+        self._dump_task_context, cluster_spec=cluster_spec, between_graph=False)
+
+    # There are one "None" task and one EVALUATOR task.
+    self.assertEqual(len(self._task_context), 2)
+    self.assertTrue("None" in self._task_context)
+    self.assertTrue(EVALUATOR in self._task_context)
+    self.assertEqual(len(self._task_context["None"]), 1)
+    self.assertEqual(len(self._task_context[EVALUATOR]), 1)
+
+    # Check whether each task has the right master_target, num_workers, is_chief
+    # and distributed_mode.
+    self.assertEqual(self._task_context["None"][0],
+                     (_bytes_to_str(self._workers[0].target), 3, True, True))
+    self.assertEqual(self._task_context[EVALUATOR][0],
+                     ("fake_evaluator", 3, False, True))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index 6ede8e4f4d9c549faae3223d400d25b7712bbc74..32a8452f620fb54a658fe0a60cb3213ddcb1c61d 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -322,6 +322,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python/keras",
     ],
 )
 
@@ -404,6 +405,7 @@ cuda_py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
+        "@six_archive//:six",
     ],
     tags = [
         "optonly",  # The test is too slow in non-opt mode
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 9e0bbce4a15ddf8b9aff848a94ae6e7773d18373..c59ad09bf1f0fbae093ce360ce3d0f544d933d6e 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -599,15 +599,18 @@ def _fast_fill(value, shape, dtype):
 
 
 def _zeros(shape, dtype):
-  """Wraps array_ops.zeros to cache last zero for a given shape and dtype."""
-  device = context.context().device_name
+  """Helper to return (possibly cached) zero tensors in eager mode."""
   if dtype == dtypes.variant:
     # TODO(apassos): need to save enough information about variant tensors to do
     # a zeros
     return None
-  # pylint: disable=protected-access
-  cache_key = shape, dtype, device, context.context()._eager_context.mode
-  # pylint: enable=protected-access
+
+  ctx = context.context()
+  if not ctx.executing_eagerly():
+    return array_ops.zeros(shape, dtype)
+
+  device = ctx.device_name
+  cache_key = shape, dtype, device
   cached = _zeros_cache.get(cache_key)
   if cached is None:
     cached = _fast_fill(0, shape, dtype)
@@ -616,6 +619,9 @@ def _zeros(shape, dtype):
 
 
 def _ones(shape, dtype):
+  if not context.context().executing_eagerly():
+    return array_ops.ones(shape, dtype)
+
   if shape == ():  # pylint: disable=g-explicit-bool-comparison
     return constant_op.constant(1, dtype=dtype)
   return _fast_fill(1, shape, dtype)
@@ -643,10 +649,10 @@ class GradientTape(object):
   Operations are recorded if they are executed within this context manager and
   at least one of their inputs is being "watched".
 
-  Trainable variables (created by `tf.contrib.eager.Variable` or
-  @{tf.get_variable}, trainable=True is default in both cases) are automatically
-  watched. Tensors can be manually watched by invoking the `watch` method on
-  this context manager.
+  Trainable variables (created by `tf.Variable` or @{tf.get_variable},
+  trainable=True is default in both cases) are automatically watched. Tensors
+  can be manually watched by invoking the `watch` method on this context
+  manager.
 
   For example, consider the function `y = x * x`. The gradient at `x = 3.0` can
   be computed as:
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index bdda200ff641d03bae40924cdc738bbf7cd60c4e..3d3f54b9c468fa6e47838a2d440c0330651402da 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -96,6 +96,19 @@ class BackpropTest(test.TestCase):
     self.assertAllEqual(grads_and_vars[0][0], 1.0)
     self.assertAllEqual(id(grads_and_vars[0][1]), id(x))
 
+  def testGradientInsideLoop(self):
+    with ops.Graph().as_default():
+      v = resource_variable_ops.ResourceVariable(1.0)
+
+      def body(_):
+        _ = v + 1.0  # This reads the variable inside the loop context
+        with backprop.GradientTape() as t:
+          result = v * 2
+        self.assertTrue(t.gradient(result, v) is not None)
+        return 1.0
+
+      control_flow_ops.while_loop(lambda i: False, body, [1.0])
+
   def testWhereGradient(self):
     # Note: where is special because only some of its arguments are of
     # differentiable dtypes.
@@ -912,32 +925,23 @@ class BackpropTest(test.TestCase):
         'did you forget to return a value from fn?'):
       val_and_grads_fn(x, y)
 
-  def testZerosCacheDoesntLeakAcrossModes(self):
-    with ops.Graph().as_default():
-      t = random_ops.random_normal(shape=[100, 2])
-      x = random_ops.random_normal(shape=[100, 4])
-      dy = random_ops.random_normal(shape=[100, 4])
-      with backprop.GradientTape() as gradient_tape:
-        gradient_tape.watch(x)
-        x1, _ = array_ops.split(x, num_or_size_splits=2, axis=1)
-        y1 = x1 ** 2.
-        y = array_ops.concat([y1, t], axis=1)
-
-      dx = gradient_tape.gradient(y, x, output_gradients=dy)
-      with self.test_session() as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(dx)
-
-    t = random_ops.random_normal(shape=[100, 2])
-    x = random_ops.random_normal(shape=[100, 4])
-    dy = random_ops.random_normal(shape=[100, 4])
-    with backprop.GradientTape() as gradient_tape:
-      gradient_tape.watch(x)
-      x1, _ = array_ops.split(x, num_or_size_splits=2, axis=1)
-      y1 = x1 ** 2.
-      y = array_ops.concat([y1, t], axis=1)
-
-    dx = gradient_tape.gradient(y, x, output_gradients=dy)
+  def testZerosCacheDoesntLeakAcrossGraphs(self):
+    with context.graph_mode():
+      def get_grad():
+        with ops.Graph().as_default(), self.test_session():
+          t = constant_op.constant(1, dtype=dtypes.float32, shape=(10, 4))
+          x = constant_op.constant(2, dtype=dtypes.float32, shape=(10, 4))
+          with backprop.GradientTape() as gt:
+            tape.watch(x)
+            x1, _ = array_ops.split(x, num_or_size_splits=2, axis=1)
+            y1 = x1**2
+            y = array_ops.concat([y1, t], axis=1)
+          return self.evaluate(gt.gradient(y, x))
+
+      grad1 = get_grad()
+      grad2 = get_grad()
+
+      self.assertAllEqual(grad1, grad2)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 3aad4a114a710280b5046666256b6b43dc0d5523..afc4bf006679cbcd50ec36b1883a1b38c993bebb 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -31,6 +31,7 @@ import numpy as np
 import six
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.python import keras
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import backprop  # pylint: disable=unused-import
 from tensorflow.python.eager import context
@@ -70,6 +71,25 @@ def c_tfe_py_fastpath_execute(a,
     six.raise_from(core._status_to_exception(e.code, message), None)
 
 
+class SubclassedKerasModel(keras.Model):
+
+  def __init__(self):
+    super(SubclassedKerasModel, self).__init__()
+    self.layer = keras.layers.Dense(
+        10, kernel_initializer="ones", bias_initializer="zeros")
+
+  def call(self, x):
+    return self.layer(x)
+
+
+def make_keras_model():
+  x = keras.Input(shape=(10,))
+  y = keras.layers.Dense(
+      10, kernel_initializer="ones", bias_initializer="zeros")(
+          x)
+  return keras.Model(inputs=x, outputs=y)
+
+
 class MicroBenchmarks(test.Benchmark):
 
   def __init__(self):
@@ -115,6 +135,7 @@ class MicroBenchmarks(test.Benchmark):
 
     def func():
       ops.EagerTensor(value, context=handle, device=device, dtype=dtype)
+
     self._run(func, 30000)
 
   def benchmark_create_float_tensor_from_list_CPU(self):
@@ -211,8 +232,8 @@ class MicroBenchmarks(test.Benchmark):
     inputs = [m]
 
     def f():
-      pywrap_tensorflow.TFE_Py_Execute(
-          ctx_handle, None, "Identity", inputs, attrs, 1)
+      pywrap_tensorflow.TFE_Py_Execute(ctx_handle, None, "Identity", inputs,
+                                       attrs, 1)
 
     self._run(f, 30000)
 
@@ -234,14 +255,13 @@ class MicroBenchmarks(test.Benchmark):
     def f():
       with backprop.GradientTape():
         pass
+
     self._run(f, 30000)
 
   def benchmark_tf_gradient_function_no_op(self):
     with context.device(CPU):
       m = gen_array_ops.identity(self._m_2)
-      self._run(
-          lambda: backprop.gradients_function(lambda x: x, [0])(m),
-          30000)
+      self._run(lambda: backprop.gradients_function(lambda x: x, [0])(m), 30000)
 
   def _benchmark_np_matmul(self, m, transpose_b, num_iters):
     a = m.cpu().numpy()
@@ -255,6 +275,7 @@ class MicroBenchmarks(test.Benchmark):
     self._run(func, num_iters, execution_mode=execution_mode)
 
   def _benchmark_gen_math_ops_matmul(self, m, transpose_b, num_iters):
+
     def func():
       gen_math_ops.mat_mul(m, m, transpose_b=transpose_b)
 
@@ -276,9 +297,10 @@ class MicroBenchmarks(test.Benchmark):
     device = context.context().device_name
     attrs = ("transpose_a", False, "transpose_b", transpose_b, "T",
              m.dtype.as_datatype_enum)
+
     def func():
-      pywrap_tensorflow.TFE_Py_Execute(ctx_handle, device, "MatMul",
-                                       inputs, attrs, 1)
+      pywrap_tensorflow.TFE_Py_Execute(ctx_handle, device, "MatMul", inputs,
+                                       attrs, 1)
 
     self._run(func, num_iters)
 
@@ -542,6 +564,30 @@ class MicroBenchmarks(test.Benchmark):
       self._benchmark_read_variable_with_tape(
           m, num_iters=self._num_iters_2_by_2)
 
+  def benchmark_keras_model_subclassed(self):
+    model = SubclassedKerasModel()
+    data = random_ops.random_uniform((10, 10))
+
+    func = lambda: model(data)
+    # First call is more expensive (creates variables etc.), discount that.
+    func()
+
+    # The whole point of this test is to contrast subclassing with
+    # the functional style of keras model building, so validate that
+    # the models are equivalent.
+    assert np.equal(func(), make_keras_model()(data)).all()
+
+    self._run(func, 30000)
+
+  def benchmark_keras_model_functional(self):
+    model = make_keras_model()
+    data = random_ops.random_uniform((10, 10))
+    func = lambda: model(data)
+    # Symmetry with benchmark_keras_model_subclassed
+    func()
+    assert np.equal(func(), SubclassedKerasModel()(data)).all()
+    self._run(func, 30000)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index 85b9491903de2ea6ffe1c5ac7ef76efdfda2818b..495a674526fa231a3a8595d5d84ac8f6660b207f 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -177,6 +177,11 @@ class Context(object):
         - tf.contrib.eager.SYNC: executes each operation synchronously.
         - tf.contrib.eager.ASYNC: executes each operation asynchronously. These
           operations may return "non-ready" handles.
+      server_def: (Optional.) A tensorflow::ServerDef proto.
+        Enables execution on remote devices. GrpcServers need to be started by
+        creating an identical server_def to this, and setting the appropriate
+        task_indexes, so that the servers can communicate. It will then be
+        possible to execute operations on remote devices.
 
     Raises:
      ValueError: If execution_mode is not valid.
diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py
index 3fabe7060e980423268eb6f52ab4043cc4a4847c..cc765725a48631f0c50662dedf7fe7af7b30f9a3 100644
--- a/tensorflow/python/eager/core_test.py
+++ b/tensorflow/python/eager/core_test.py
@@ -610,6 +610,14 @@ class TFETest(test_util.TensorFlowTestCase):
       self.assertEquals(typ, dtypes.float32)
       self.assertIsInstance(t, ops.EagerTensor)
 
+  def testConvertMixedEagerTensorsWithVariables(self):
+    var = resource_variable_ops.ResourceVariable(1.0)
+    types, tensors = execute_lib.convert_to_mixed_eager_tensors(
+        ['foo', var], context.context())
+    self.assertAllEqual([dtypes.string, dtypes.float32], types)
+    for t in tensors:
+      self.assertIsInstance(t, ops.EagerTensor)
+
 
 class SendRecvTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/eager/execute.py b/tensorflow/python/eager/execute.py
index 2ff5b8d8f489731c14d8abb81652a17026ed4935..f9b8d2cb5db9aedcd834afcde00dac3afa4008bb 100644
--- a/tensorflow/python/eager/execute.py
+++ b/tensorflow/python/eager/execute.py
@@ -198,11 +198,7 @@ def args_to_matching_eager(l, ctx, default_dtype=None):
 
 
 def convert_to_mixed_eager_tensors(values, ctx):
-  v = [
-      t if isinstance(t, ops.EagerTensor) else ops.EagerTensor(
-          t, context=ctx._handle, device=ctx.device_name)  # pylint: disable=protected-access
-      for t in values
-  ]
+  v = [ops.internal_convert_to_tensor(t, ctx=ctx) for t in values]
   types = [t._datatype_enum() for t in v]  # pylint: disable=protected-access
   return types, v
 
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index a6906f9efdb698205f2ef28245c9cdce5cd01537..5e4f9e29da0324248acc31131f3ecc03b5edfeee 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -470,37 +470,39 @@ class GraphModeFunction(object):
 
   def _construct_backprop_function(self):
     """Constructs the backprop function object for this function."""
-    with self._graph.as_default():
-      c_known_ops = set()
-      c_captured_tensors = set()
-
-      existing_op_len = len(self._graph.get_operations())
-      filtered_outputs = [x for x in self._python_returns if x is not None]
+    filtered_outputs = [x for x in self._python_returns if x is not None]
+    captures = {}
+    backwards_graph = CapturingGraph(captures)
+    backwards_graph._graph_key = self._graph._graph_key  # pylint: disable=protected-access
+    for collection in self._graph.collections:
+      backwards_graph.get_collection_ref(
+          collection)[:] = self._graph.get_collection(collection)
+    backwards_graph.seed = self._graph.seed
+    with backwards_graph.as_default():
       self._out_grad_placeholders = [
           graph_placeholder(x.dtype, x.shape) for x in filtered_outputs]
-      in_gradients = gradients_impl.gradients(
+      in_gradients = gradients_impl._GradientsHelper(  # pylint: disable=protected-access
           filtered_outputs,
           self._input_placeholders,
-          grad_ys=self._out_grad_placeholders)
-      for op in self._graph.get_operations()[existing_op_len:]:
-        if op.type in ["Variable", "VariableV2", "VarHandleOp"]:
-          raise ValueError("defun cannot capture variables created without "
-                           "using tf.get_variable. Op: %s" % op)
-        c_known_ops.add(op)
-        for i in op.inputs:
-          if i.op not in c_known_ops:
-            c_captured_tensors.add(i)
+          grad_ys=self._out_grad_placeholders,
+          src_graph=self._graph)
 
     backward_outputs = tuple(
         grad for grad in _flatten(in_gradients) if grad is not None)
     output_shapes = tuple(grad.shape for grad in backward_outputs)
 
-    captures = list(sorted(c_captured_tensors, key=lambda x: x.name))
+    ids = list(sorted(captures.keys()))
+    if ids:
+      extra_inputs, extra_placeholders = zip(*[captures[x] for x in ids])
+    else:
+      extra_inputs = []
+      extra_placeholders = []
+
     forward_name = _forward_name(self._func_name)
     self._forward_fdef = _EagerDefinedFunction(
         forward_name, self._graph, self._ops, self._input_placeholders,
-        filtered_outputs + captures, self._attrs)
-    all_inputs = self._out_grad_placeholders + captures
+        filtered_outputs + list(extra_inputs), self._attrs)
+    all_inputs = self._out_grad_placeholders + list(extra_placeholders)
     # Excluding input ops from the body as we do not intend to execute these
     # operations when the function is executed.
     all_ignored_ops = frozenset(x.op for x in all_inputs)
@@ -508,11 +510,12 @@ class GraphModeFunction(object):
     # means rerunning the function-defining code will always define the same
     # function, which is useful if we serialize this etc.
     function_def_ops = tuple(x
-                             for x in sorted(c_known_ops, key=lambda x: x.name)
+                             for x in sorted(backwards_graph.get_operations(),
+                                             key=lambda x: x.name)
                              if x not in all_ignored_ops)
     bname = _backward_name(self._func_name)
     self._backward_function = GraphModeFunction(
-        bname, all_inputs, [], self._graph, function_def_ops,
+        bname, all_inputs, [], backwards_graph, function_def_ops,
         backward_outputs, in_gradients, output_shapes, attrs=self._attrs)
 
   def _backprop_call(self, args):
@@ -1077,7 +1080,7 @@ def defun(func=None, compiled=False):
   tf.enable_eager_execution()
 
   def fn():
-    x = tf.contrib.eager.Variable(0.0)
+    x = tf.Variable(0.0)
     x.assign_add(1.0)
     return x.read_value()
 
@@ -1094,19 +1097,18 @@ def defun(func=None, compiled=False):
   ```
 
   Finally, because each input signature is bound to a unique graph, if your
-  Python function constructs `tf.contrib.eager.Variable` objects, then each
-  graph constructed for that Python function will reference a unique set of
-  variables. To circumvent this problem, we recommend against compiling Python
-  functions that create `tf.contrib.eager.Variable` objects. Instead, Python
-  functions should either lexically close over `tf.contrib.eager.Variable`
-  objects or accept them as arguments, preferably encapsulated in an
-  object-oriented container. If you must create variables inside your Python
-  function and you want each graph generated for it to reference the same set of
-  variables, add logic to your Python function that ensures that variables are
-  only created the first time it is called and are reused for every subsequent
-  invocation; note that this is precisely what @{tf.keras.layers.Layer} objects
-  do, so we recommend using them to represent variable-bearing computations
-  whenever possible.
+  Python function constructs `tf.Variable` objects, then each graph constructed
+  for that Python function will reference a unique set of variables. To
+  circumvent this problem, we recommend against compiling Python functions that
+  create `tf.Variable` objects. Instead, Python functions should either
+  lexically close over `tf.Variable` objects or accept them as arguments,
+  preferably encapsulated in an object-oriented container. If you must create
+  variables inside your Python function and you want each graph generated for it
+  to reference the same set of variables, add logic to your Python function that
+  ensures that variables are only created the first time it is called and are
+  reused for every subsequent invocation; note that this is precisely what
+  @{tf.keras.layers.Layer} objects do, so we recommend using them to represent
+  variable-bearing computations whenever possible.
 
   Args:
     func: function to be compiled. If `func` is None, returns a
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index cdd9fe176092f9c7d3c7e2512714849c39285791..2e86563a7d0835424d77b5df31e7600cebd52a77 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -45,6 +45,7 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
+from tensorflow.python.training import adam
 from tensorflow.python.training import momentum
 from tensorflow.python.training import training_ops
 from tensorflow.python.util import compat
@@ -212,6 +213,19 @@ class FunctionTest(test.TestCase):
     self.assertEqual(fn_op.output_shapes, None)
     self.assertAllEqual(fn_op(x, x), None)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testDefunCondGradient(self):
+
+    @function.defun
+    def f(x):
+      return control_flow_ops.cond(x > 0.5, lambda: 2 * x, lambda: 3 * x)
+
+    with backprop.GradientTape() as t:
+      x = constant_op.constant(1.0)
+      t.watch(x)
+      y = f(x)
+    self.assertAllEqual(self.evaluate(t.gradient(y, x)), 2.0)
+
   def testDefunCapturedInt32(self):
     x = constant_op.constant(1, dtype=dtypes.int32)
 
@@ -1166,6 +1180,23 @@ class AutomaticControlDependenciesTest(test.TestCase):
     value = train()
     self.assertEqual(value.numpy(), -1.0)
 
+  # TODO(b/111663004): This should work when the outer context is graph
+  # building.
+  def testOptimizerNonSlotVarsInDefunNoError(self):
+    def loss(v):
+      return v**2
+
+    optimizer = adam.AdamOptimizer(learning_rate=1.0)
+
+    @function.defun
+    def train():
+      v = resource_variable_ops.ResourceVariable(1.0)
+      grad = backprop.implicit_grad(loss)(v)
+      optimizer.apply_gradients(grad)
+      return v.read_value()
+
+    train()
+
   def testOptimizerInDefunWithCapturedVariable(self):
     v = resource_variable_ops.ResourceVariable(1.0)
     def loss():
diff --git a/tensorflow/python/eager/memory_test.py b/tensorflow/python/eager/memory_test.py
index 74c6cbdd319a3a0476adbff08fc6e70fee65df5c..a1a59d511fdd4b831ea853b1f1cb3212322a3b84 100644
--- a/tensorflow/python/eager/memory_test.py
+++ b/tensorflow/python/eager/memory_test.py
@@ -24,6 +24,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import six
+
 from tensorflow.python import keras
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
@@ -63,7 +65,7 @@ class MemoryTest(test.TestCase):
 
       initial = memory_profiler.memory_usage(-1)[0]
 
-      for _ in xrange(num_iters):
+      for _ in six.moves.range(num_iters):
         f()
 
       increase = memory_profiler.memory_usage(-1)[0] - initial
diff --git a/tensorflow/python/eager/ops_test.py b/tensorflow/python/eager/ops_test.py
index fc76ede4c502ae8b554c925a921e419bf003c40c..17a090d5262f790c92dfa1a92d47f9b5ac6c07d9 100644
--- a/tensorflow/python/eager/ops_test.py
+++ b/tensorflow/python/eager/ops_test.py
@@ -370,6 +370,10 @@ class OpsTest(test_util.TensorFlowTestCase):
     with self.assertRaises(TypeError):
       float(x)
 
+  def testRange(self):
+    x = constant_op.constant(2)
+    self.assertEqual([0, 1], list(range(x)))
+
   def testFormatString(self):
     x = constant_op.constant(3.1415)
     self.assertEqual('3.14', '{:.2f}'.format(x))
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index cefd5b120615831970b44d9250afab242f048a5f..15d2ccf9d2b533ced7fd0d104d7a3be3c2ad4dd3 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -154,6 +154,7 @@ TFE_TensorHandle* EagerCast(TFE_Context* ctx, TFE_TensorHandle* handle,
   if (TF_GetCode(out_status) != TF_OK) RETURN_ERROR
   TFE_OpSetAttrType(op, "SrcT", src_type_enum);
   TFE_OpSetAttrType(op, "DstT", dst_type_enum);
+  TFE_OpSetAttrBool(op, "Truncate", false);
   TFE_TensorHandle* output = nullptr;
   int num_outputs = 1;
   TFE_Execute(op, &output, &num_outputs, out_status);
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 4d28e98961463d3da832562c6a1e5b95d31d86bf..0eabea321c120e1eb483b7161e88f964340db76d 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -845,11 +845,9 @@ int64_t get_uid() {
 PyObject* TFE_Py_UID() { return PyLong_FromLongLong(get_uid()); }
 
 void TFE_DeleteContextCapsule(PyObject* context) {
-  TF_Status* status = TF_NewStatus();
   TFE_Context* ctx =
       reinterpret_cast<TFE_Context*>(PyCapsule_GetPointer(context, nullptr));
-  TFE_DeleteContext(ctx, status);
-  TF_DeleteStatus(status);
+  TFE_DeleteContext(ctx);
 }
 
 static tensorflow::int64 MakeInt(PyObject* integer) {
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 6c415b1bf26aecf03437aea7f7e7aa3da79060e0..817c8e6848dcab5e04770e33dda803ed2a4a0c9a 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -40,9 +40,9 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":gc",
+        ":metric_keys",
+        ":util",
         "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator:metric_keys",
-        "//tensorflow/python/estimator:util",
     ],
 )
 
@@ -171,6 +171,7 @@ py_test(
     name = "baseline_test",
     size = "medium",
     srcs = ["canned/baseline_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = [
         "no_pip",
@@ -207,6 +208,7 @@ py_test(
     name = "boosted_trees_test",
     size = "medium",
     srcs = ["canned/boosted_trees_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
     tags = [
         "optonly",
@@ -676,6 +678,7 @@ py_test(
     name = "keras_test",
     size = "large",
     srcs = ["keras_test.py"],
+    shard_count = 4,
     srcs_version = "PY2AND3",
     tags = [
         "no_windows",
@@ -683,9 +686,9 @@ py_test(
     ],
     deps = [
         ":keras",
+        ":numpy_io",
+        ":run_config",
         "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator:numpy_io",
-        "//tensorflow/python/estimator:run_config",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 3292e2724d954b88c2ed71af4291b088684c770f..8b423f76de8e3b685e0426ef48fdd9f195925ff1 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -46,7 +46,7 @@ from tensorflow.python.util.tf_export import estimator_export
 # TODO(nponomareva): Reveal pruning params here.
 _TreeHParams = collections.namedtuple('TreeHParams', [
     'n_trees', 'max_depth', 'learning_rate', 'l1', 'l2', 'tree_complexity',
-    'min_node_weight', 'center_bias'
+    'min_node_weight', 'center_bias', 'pruning_mode'
 ])
 
 _HOLD_FOR_MULTI_CLASS_SUPPORT = object()
@@ -410,9 +410,20 @@ class _EnsembleGrower(object):
     Args:
       tree_ensemble: A TreeEnsemble variable.
       tree_hparams: TODO. collections.namedtuple for hyper parameters.
+    Raises:
+      ValueError: when pruning mode is invalid or pruning is used and no tree
+      complexity is set.
     """
     self._tree_ensemble = tree_ensemble
     self._tree_hparams = tree_hparams
+    # pylint: disable=protected-access
+    self._pruning_mode_parsed = boosted_trees_ops.PruningMode.from_str(
+        tree_hparams.pruning_mode)
+
+    if (self._pruning_mode_parsed != boosted_trees_ops.PruningMode.NO_PRUNING
+        and tree_hparams.tree_complexity <= 0):
+      raise ValueError('For pruning, tree_complexity must be positive.')
+    # pylint: enable=protected-access
 
   @abc.abstractmethod
   def center_bias(self, center_bias_var, gradients, hessians):
@@ -500,7 +511,7 @@ class _EnsembleGrower(object):
         right_node_contribs=right_node_contribs_list,
         learning_rate=self._tree_hparams.learning_rate,
         max_depth=self._tree_hparams.max_depth,
-        pruning_mode=boosted_trees_ops.PruningMode.NO_PRUNING)
+        pruning_mode=self._pruning_mode_parsed)
     return grow_op
 
 
@@ -675,6 +686,7 @@ def _bt_model_fn(
   is_single_machine = (config.num_worker_replicas <= 1)
   sorted_feature_columns = sorted(feature_columns, key=lambda tc: tc.name)
   center_bias = tree_hparams.center_bias
+
   if train_in_memory:
     assert n_batches_per_layer == 1, (
         'When train_in_memory is enabled, input_fn should return the entire '
@@ -925,7 +937,8 @@ class BoostedTreesClassifier(estimator.Estimator):
                tree_complexity=0.,
                min_node_weight=0.,
                config=None,
-               center_bias=False):
+               center_bias=False,
+               pruning_mode='none'):
     """Initializes a `BoostedTreesClassifier` instance.
 
     Example:
@@ -999,7 +1012,11 @@ class BoostedTreesClassifier(estimator.Estimator):
         regression problems, the first node will return the mean of the labels.
         For binary classification problems, it will return a logit for a prior
         probability of label 1.
-
+      pruning_mode: one of 'none', 'pre', 'post' to indicate no pruning, pre-
+        pruning (do not split a node if not enough gain is observed) and post
+        pruning (build the tree up to a max depth and then prune branches with
+        negative gain). For pre and post pruning, you MUST provide
+        tree_complexity >0.
 
     Raises:
       ValueError: when wrong arguments are given or unsupported functionalities
@@ -1012,9 +1029,9 @@ class BoostedTreesClassifier(estimator.Estimator):
         n_classes, weight_column, label_vocabulary=label_vocabulary)
 
     # HParams for the model.
-    tree_hparams = _TreeHParams(n_trees, max_depth, learning_rate,
-                                l1_regularization, l2_regularization,
-                                tree_complexity, min_node_weight, center_bias)
+    tree_hparams = _TreeHParams(
+        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+        tree_complexity, min_node_weight, center_bias, pruning_mode)
 
     def _model_fn(features, labels, mode, config):
       return _bt_model_fn(  # pylint: disable=protected-access
@@ -1058,7 +1075,8 @@ class BoostedTreesRegressor(estimator.Estimator):
                tree_complexity=0.,
                min_node_weight=0.,
                config=None,
-               center_bias=False):
+               center_bias=False,
+               pruning_mode='none'):
     """Initializes a `BoostedTreesRegressor` instance.
 
     Example:
@@ -1125,6 +1143,11 @@ class BoostedTreesRegressor(estimator.Estimator):
         regression problems, the first node will return the mean of the labels.
         For binary classification problems, it will return a logit for a prior
         probability of label 1.
+      pruning_mode: one of 'none', 'pre', 'post' to indicate no pruning, pre-
+        pruning (do not split a node if not enough gain is observed) and post
+        pruning (build the tree up to a max depth and then prune branches with
+        negative gain). For pre and post pruning, you MUST provide
+        tree_complexity >0.
 
     Raises:
       ValueError: when wrong arguments are given or unsupported functionalities
@@ -1136,9 +1159,9 @@ class BoostedTreesRegressor(estimator.Estimator):
     head = _create_regression_head(label_dimension, weight_column)
 
     # HParams for the model.
-    tree_hparams = _TreeHParams(n_trees, max_depth, learning_rate,
-                                l1_regularization, l2_regularization,
-                                tree_complexity, min_node_weight, center_bias)
+    tree_hparams = _TreeHParams(
+        n_trees, max_depth, learning_rate, l1_regularization, l2_regularization,
+        tree_complexity, min_node_weight, center_bias, pruning_mode)
 
     def _model_fn(features, labels, mode, config):
       return _bt_model_fn(  # pylint: disable=protected-access
diff --git a/tensorflow/python/estimator/canned/boosted_trees_test.py b/tensorflow/python/estimator/canned/boosted_trees_test.py
index f807641057990971407f69ff0ba4d3513302e452..ec597e468615e0bb16b832e82d4049ef5961c4c3 100644
--- a/tensorflow/python/estimator/canned/boosted_trees_test.py
+++ b/tensorflow/python/estimator/canned/boosted_trees_test.py
@@ -1508,7 +1508,8 @@ class ModelFnTests(test_util.TensorFlowTestCase):
         l2=0.01,
         tree_complexity=0.,
         min_node_weight=0.,
-        center_bias=center_bias)
+        center_bias=center_bias,
+        pruning_mode='none')
 
     estimator_spec = boosted_trees._bt_model_fn(  # pylint:disable=protected-access
         features=features,
diff --git a/tensorflow/python/estimator/canned/metric_keys.py b/tensorflow/python/estimator/canned/metric_keys.py
index 4f7c849ba4b058492c55dd27e0bf79f8d540ece9..9d49240fea4579fffe25172092080560ccd1d35d 100644
--- a/tensorflow/python/estimator/canned/metric_keys.py
+++ b/tensorflow/python/estimator/canned/metric_keys.py
@@ -47,3 +47,8 @@ class MetricKeys(object):
   PROBABILITY_MEAN_AT_CLASS = 'probability_mean/class%d'
   AUC_AT_CLASS = 'auc/class%d'
   AUC_PR_AT_CLASS = 'auc_precision_recall/class%d'
+
+  # The following require a class name applied.
+  PROBABILITY_MEAN_AT_NAME = 'probability_mean/%s'
+  AUC_AT_NAME = 'auc/%s'
+  AUC_PR_AT_NAME = 'auc_precision_recall/%s'
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 2fd6f6fab9c2aaea4106892174e8fa07190ab5f0..cc5a61b54efa9ea9187a1a791f3a5a9b929429ef 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -567,14 +567,19 @@ class Estimator(object):
 
   def _assert_members_are_not_overridden(self):
     """Asserts members of `Estimator` are not overridden."""
+    # TPUEstimator is special cased (owned by TF).
+    if self.__class__.__name__ == 'TPUEstimator':
+      return
+
     allowed_overrides = set([
-        '_call_input_fn', '_create_global_step',
+        '_call_input_fn', '_call_model_fn',
         '_convert_train_steps_to_hooks', '_convert_eval_steps_to_hooks',
+        '_create_global_step', '_create_and_assert_global_step',
         '_tf_api_names', '_tf_api_names_v1', '_estimator_api_names',
         '_estimator_api_names_v1', '_estimator_api_constants',
         '_estimator_api_constants_v1',
         '_validate_features_in_predict_input',
-        '_call_model_fn', '_add_meta_graph_for_mode'
+        '_add_meta_graph_for_mode'
     ])
     estimator_members = set([m for m in Estimator.__dict__.keys()
                              if not m.startswith('__')])
@@ -901,9 +906,10 @@ class Estimator(object):
 
       with tf_session.Session(config=self._session_config) as session:
 
-        local_init_op = (
-            estimator_spec.scaffold.local_init_op or
-            monitored_session.Scaffold.default_local_init_op())
+        if estimator_spec.scaffold.local_init_op is not None:
+          local_init_op = estimator_spec.scaffold.local_init_op
+        else:
+          local_init_op = monitored_session.Scaffold.default_local_init_op()
 
         # This saver will be used both for restoring variables now,
         # and in saving out the metagraph below. This ensures that any
@@ -1155,13 +1161,19 @@ class Estimator(object):
     with ops.Graph().as_default() as g, g.device(self._device_fn):
       random_seed.set_random_seed(self._config.tf_random_seed)
       global_step_tensor = self._create_and_assert_global_step(g)
-      training_util._get_or_create_global_step_read()  # pylint: disable=protected-access
+
+      # Skip creating a read variable if _create_and_assert_global_step
+      # returns None (e.g. tf.contrib.estimator.SavedModelEstimator).
+      if global_step_tensor is not None:
+        training_util._get_or_create_global_step_read(g)  # pylint: disable=protected-access
+
       features, labels, input_hooks = (
           self._get_features_and_labels_from_input_fn(
               input_fn, model_fn_lib.ModeKeys.TRAIN))
       worker_hooks.extend(input_hooks)
       estimator_spec = self._call_model_fn(
           features, labels, model_fn_lib.ModeKeys.TRAIN, self.config)
+      global_step_tensor = training_util.get_global_step(g)
       return self._train_with_estimator_spec(estimator_spec, worker_hooks,
                                              hooks, global_step_tensor,
                                              saving_listeners)
@@ -1448,13 +1460,13 @@ class Estimator(object):
   def _evaluate_build_graph(self, input_fn, hooks=None, checkpoint_path=None):
     """Builds the graph and related hooks to run evaluation."""
     random_seed.set_random_seed(self._config.tf_random_seed)
-    global_step_tensor = self._create_and_assert_global_step(
-        ops.get_default_graph())
+    self._create_and_assert_global_step(ops.get_default_graph())
     features, labels, input_hooks = (
         self._get_features_and_labels_from_input_fn(input_fn,
                                                     model_fn_lib.ModeKeys.EVAL))
     estimator_spec = self._call_model_fn(
         features, labels, model_fn_lib.ModeKeys.EVAL, self.config)
+    global_step_tensor = training_util.get_global_step(ops.get_default_graph())
 
     # Call to warm_start has to be after model_fn is called.
     self._maybe_warm_start(checkpoint_path)
@@ -1480,7 +1492,21 @@ class Estimator(object):
     all_hooks.extend(hooks)
     all_hooks.extend(list(estimator_spec.evaluation_hooks or []))
 
-    return estimator_spec.scaffold, update_op, eval_dict, all_hooks
+    # New local variables have been added, so update the estimator spec's
+    # local init op if it was defined.
+    scaffold = estimator_spec.scaffold
+    if estimator_spec.scaffold and estimator_spec.scaffold.local_init_op:
+      # Ensure that eval step has been created before updating local init op.
+      evaluation._get_or_create_eval_step()  # pylint: disable=protected-access
+
+      scaffold = monitored_session.Scaffold(
+          local_init_op=control_flow_ops.group(
+              estimator_spec.scaffold.local_init_op,
+              monitored_session.Scaffold.default_local_init_op()),
+          copy_from_scaffold=scaffold
+      )
+
+    return scaffold, update_op, eval_dict, all_hooks
 
   def _evaluate_run(self, checkpoint_path, scaffold, update_op, eval_dict,
                     all_hooks, output_dir):
@@ -1911,6 +1937,19 @@ class WarmStartSettings(
     )
 
 
+def _get_saved_model_ckpt(saved_model_dir):
+  """Return path to variables checkpoint in a SavedModel directory."""
+  if not gfile.Exists(
+      os.path.join(compat.as_bytes(saved_model_dir),
+                   compat.as_bytes('variables/variables.index'))):
+    raise ValueError('Directory provided has an invalid SavedModel format: %s'
+                     % saved_model_dir)
+  return os.path.join(
+      compat.as_bytes(saved_model_dir),
+      compat.as_bytes('{}/{}'.format(constants.VARIABLES_DIRECTORY,
+                                     constants.VARIABLES_FILENAME)))
+
+
 def _get_default_warm_start_settings(warm_start_from):
   """Returns default WarmStartSettings.
 
@@ -1934,10 +1973,8 @@ def _get_default_warm_start_settings(warm_start_from):
     if gfile.Exists(os.path.join(compat.as_bytes(warm_start_from),
                                  compat.as_bytes('variables/variables.index'))):
       logging.info('Warm-starting from a SavedModel')
-      return WarmStartSettings(ckpt_to_initialize_from=os.path.join(
-          compat.as_bytes(warm_start_from),
-          compat.as_bytes('{}/{}'.format(constants.VARIABLES_DIRECTORY,
-                                         constants.VARIABLES_FILENAME))))
+      return WarmStartSettings(
+          ckpt_to_initialize_from=_get_saved_model_ckpt(warm_start_from))
     return WarmStartSettings(ckpt_to_initialize_from=warm_start_from)
   elif isinstance(warm_start_from, WarmStartSettings):
     return warm_start_from
diff --git a/tensorflow/python/estimator/keras.py b/tensorflow/python/estimator/keras.py
index 076359b503b37b0088282c941199257432ca1230..70517ae2783ff782a10e0d0f51e943eeabf0a045 100644
--- a/tensorflow/python/estimator/keras.py
+++ b/tensorflow/python/estimator/keras.py
@@ -21,11 +21,14 @@ from __future__ import print_function
 
 import os
 import re
+import tempfile
+
 from tensorflow.python.client import session
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import export as export_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import run_config as run_config_lib
+from tensorflow.python.estimator.run_config import RunConfig
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
@@ -39,6 +42,7 @@ from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics as metrics_module
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import distribute as distribute_lib
@@ -180,7 +184,7 @@ def _in_place_subclassed_model_reset(model):
   # Replace layers on the model with fresh layers
   layers_to_names = {value: key for key, value in attributes_cache.items()}
   original_layers = model._layers[:]
-  model._layers = []
+  model._layers = data_structures.NoDependency([])
   for layer in original_layers:  # We preserve layer order.
     config = layer.get_config()
     # This will not work for nested subclassed models used as layers.
@@ -228,7 +232,8 @@ def _in_place_subclassed_model_reset(model):
       ]
       for name in attributes_to_cache:
         attributes_cache[name] = getattr(model, name)
-  model._original_attributes_cache = attributes_cache
+  model._original_attributes_cache = data_structures.NoDependency(
+      attributes_cache)
   # Reset built state
   model.built = False
   model.inputs = None
@@ -426,29 +431,34 @@ def _create_keras_model_fn(keras_model, custom_objects=None):
   return model_fn
 
 
-def _save_first_checkpoint(keras_model, estimator, custom_objects,
-                           keras_weights):
+def _save_first_checkpoint(keras_model, custom_objects, config):
   """Save first checkpoint for the keras Estimator.
 
   Args:
     keras_model: an instance of compiled keras model.
-    estimator: keras estimator.
     custom_objects: Dictionary for custom objects.
-    keras_weights: A flat list of Numpy arrays for weights of given keras_model.
+    config: Estimator config.
 
   Returns:
-    The model_fn for a keras Estimator.
+    The path where keras model checkpoint is saved.
   """
+  # save checkpoint into subdirectory to allow warm start
+  keras_model_dir = os.path.join(config.model_dir, 'keras')
   # Load weights and save to checkpoint if there is no checkpoint
-  latest_path = saver_lib.latest_checkpoint(estimator.model_dir)
+  latest_path = saver_lib.latest_checkpoint(keras_model_dir)
   if not latest_path:
+    keras_weights = None
+    if _any_weight_initialized(keras_model):
+      keras_weights = keras_model.get_weights()
+    if not gfile.IsDirectory(keras_model_dir):
+      gfile.MakeDirs(keras_model_dir)
     with ops.Graph().as_default():
-      random_seed.set_random_seed(estimator.config.tf_random_seed)
+      random_seed.set_random_seed(config.tf_random_seed)
       training_util.create_global_step()
       model = _clone_and_build_model(model_fn_lib.ModeKeys.TRAIN, keras_model,
                                      custom_objects)
       # save to checkpoint
-      with session.Session(config=estimator._session_config) as sess:
+      with session.Session(config=config.session_config) as sess:
         if keras_weights:
           model.set_weights(keras_weights)
         # Make update ops and initialize all variables.
@@ -458,7 +468,46 @@ def _save_first_checkpoint(keras_model, estimator, custom_objects,
           K._initialize_variables(sess)
           # pylint: enable=protected-access
         saver = saver_lib.Saver()
-        saver.save(sess, os.path.join(estimator.model_dir, 'keras_model.ckpt'))
+        latest_path = os.path.join(keras_model_dir, 'keras_model.ckpt')
+        saver.save(sess, latest_path)
+  return latest_path
+
+
+def _maybe_overwrite_model_dir_and_session_config(config, model_dir):
+  """Overwrite estimator config by `model_dir` and `session_config` if needed.
+
+  Args:
+    config: Original estimator config.
+    model_dir: Estimator model checkpoint directory.
+
+  Returns:
+    Overwritten estimator config.
+
+  Raises:
+    ValueError: Model directory inconsistent between `model_dir` and `config`.
+  """
+
+  default_session_config = run_config_lib.get_default_session_config()
+  if isinstance(config, dict):
+    config = RunConfig(**config)
+  elif config is None:
+    config = RunConfig(session_config=default_session_config)
+  if config.session_config is None:
+    config = RunConfig.replace(config, session_config=default_session_config)
+
+  if model_dir is not None:
+    if (getattr(config, 'model_dir', None) is not None and
+        config.model_dir != model_dir):
+      raise ValueError(
+          "`model_dir` are set both in constructor and `RunConfig`, but with "
+          "different values. In constructor: '{}', in `RunConfig`: "
+          "'{}' ".format(model_dir, config.model_dir))
+    config = RunConfig.replace(config, model_dir=model_dir)
+  elif getattr(config, 'model_dir', None) is None:
+    model_dir = tempfile.mkdtemp()
+    config = RunConfig.replace(config, model_dir=model_dir)
+
+  return config
 
 
 def model_to_estimator(keras_model=None,
@@ -517,45 +566,39 @@ def model_to_estimator(keras_model=None,
         'Please compile the model with `model.compile()` '
         'before calling `model_to_estimator()`.')
 
-  if isinstance(config, dict):
-    config = run_config_lib.RunConfig(**config)
+  config = _maybe_overwrite_model_dir_and_session_config(config, model_dir)
 
   keras_model_fn = _create_keras_model_fn(keras_model, custom_objects)
-  estimator = estimator_lib.Estimator(
-      keras_model_fn, model_dir=model_dir, config=config)
-
-  # Check if we need to call get_weights:
   if _any_weight_initialized(keras_model):
-    keras_weights = keras_model.get_weights()
     # Warn if config passed to estimator tries to update GPUOptions. If a
     # session has already been created, the GPUOptions passed to the first
     # session sticks.
-    if estimator._session_config.HasField('gpu_options'):
+    if config.session_config.HasField('gpu_options'):
       logging.warning(
           'The Keras backend session has already been set. '
           'The _session_config passed to model_to_estimator will not be used.')
   else:
     # Pass the config into keras backend's default session.
-    sess = session.Session(config=estimator._session_config)
+    sess = session.Session(config=config.session_config)
     K.set_session(sess)
-    keras_weights = None
 
+  warm_start_path = None
   if keras_model._is_graph_network:
-    # TODO(yifeif): move checkpoint initialization to scaffold.init_fn
-    _save_first_checkpoint(keras_model,
-                           estimator,
-                           custom_objects,
-                           keras_weights)
+    warm_start_path = _save_first_checkpoint(keras_model, custom_objects,
+                                             config)
   elif keras_model.built:
-    logging.warning('You are creating an Estimator from a Keras model '
-                    'manually subclassed from `Model`, that was '
-                    'already called on some inputs (and thus already had '
-                    'weights). We are currently unable to preserve '
-                    'the model\'s state (its weights) '
-                    'as part of the estimator '
-                    'in this case. Be warned that the estimator '
-                    'has been created using '
-                    'a freshly initialized version of your model.\n'
-                    'Note that this doesn\'t affect the state of the '
-                    'model instance you passed as `keras_model` argument.')
+    logging.warning('You are creating an Estimator from a Keras model manually '
+                    'subclassed from `Model`, that was already called on some '
+                    'inputs (and thus already had weights). We are currently '
+                    'unable to preserve the model\'s state (its weights) as '
+                    'part of the estimator in this case. Be warned that the '
+                    'estimator has been created using a freshly initialized '
+                    'version of your model.\n'
+                    'Note that this doesn\'t affect the state of the model '
+                    'instance you passed as `keras_model` argument.')
+
+  estimator = estimator_lib.Estimator(keras_model_fn,
+                                      config=config,
+                                      warm_start_from=warm_start_path)
+
   return estimator
diff --git a/tensorflow/python/estimator/keras_test.py b/tensorflow/python/estimator/keras_test.py
index 7a3c5a9bf17e6d33df4bb55401e57a653a4cda1f..cf4ec7f4da2f2be075e59c9f4fe17980efd99d71 100644
--- a/tensorflow/python/estimator/keras_test.py
+++ b/tensorflow/python/estimator/keras_test.py
@@ -33,11 +33,13 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.optimizers import SGD
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.parsing_ops import gen_parsing_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import rmsprop
+from tensorflow.python.training import session_run_hook
 
 
 try:
@@ -50,6 +52,8 @@ _TRAIN_SIZE = 200
 _INPUT_SIZE = (10,)
 _NUM_CLASS = 2
 
+_TMP_DIR = '/tmp'
+
 
 def simple_sequential_model():
   model = keras.models.Sequential()
@@ -167,6 +171,12 @@ def multi_inputs_multi_outputs_model():
   return model
 
 
+class MyHook(session_run_hook.SessionRunHook):
+
+  def begin(self):
+    _ = variable_scope.get_variable('temp', [1])
+
+
 class TestKerasEstimator(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -203,6 +213,54 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
       writer_cache.FileWriterCache.clear()
       gfile.DeleteRecursively(self._config.model_dir)
 
+  # see b/109935364
+  @test_util.run_in_graph_and_eager_modes
+  def test_train_with_hooks(self):
+    for model_type in ['sequential', 'functional']:
+      keras_model, (_, _), (
+          _, _), train_input_fn, eval_input_fn = get_resource_for_simple_model(
+              model_type=model_type, is_evaluate=True)
+      keras_model.compile(
+          loss='categorical_crossentropy',
+          optimizer=rmsprop.RMSPropOptimizer(1e-3),
+          metrics=['mse', keras.metrics.categorical_accuracy])
+
+      my_hook = MyHook()
+      with self.test_session():
+        est_keras = keras_lib.model_to_estimator(
+            keras_model=keras_model, config=self._config)
+        before_eval_results = est_keras.evaluate(
+            input_fn=eval_input_fn, steps=1)
+        est_keras.train(input_fn=train_input_fn, hooks=[my_hook],
+                        steps=_TRAIN_SIZE / 16)
+        after_eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
+        self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
+
+      writer_cache.FileWriterCache.clear()
+      gfile.DeleteRecursively(self._config.model_dir)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_train_with_model_fit_and_hooks(self):
+    keras_model, (x_train, y_train), _, \
+      train_input_fn, eval_input_fn = get_resource_for_simple_model(
+          model_type='sequential', is_evaluate=True)
+
+    keras_model.compile(
+        loss='categorical_crossentropy',
+        optimizer=rmsprop.RMSPropOptimizer(1e-3),
+        metrics=['mse', keras.metrics.categorical_accuracy])
+    my_hook = MyHook()
+    with self.test_session():
+      keras_model.fit(x_train, y_train, epochs=1)
+
+      keras_est = keras_lib.model_to_estimator(
+          keras_model=keras_model, config=self._config)
+      before_eval_results = keras_est.evaluate(input_fn=eval_input_fn)
+      keras_est.train(input_fn=train_input_fn, hooks=[my_hook],
+                      steps=_TRAIN_SIZE / 16)
+      after_eval_results = keras_est.evaluate(input_fn=eval_input_fn, steps=1)
+      self.assertLess(after_eval_results['loss'], before_eval_results['loss'])
+
   @test_util.run_in_graph_and_eager_modes
   def test_train_with_tf_optimizer(self):
     for model_type in ['sequential', 'functional']:
@@ -473,27 +531,43 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
         est_keras.train(input_fn=invald_output_name_input_fn, steps=100)
 
   def test_custom_objects(self):
-    
+
     def relu6(x):
       return keras.backend.relu(x, max_value=6)
-    
+
     keras_model = simple_functional_model(activation=relu6)
     keras_model.compile(loss='categorical_crossentropy', optimizer='adam')
     custom_objects = {
         'relu6': relu6
     }
 
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=_TRAIN_SIZE,
+        test_samples=50,
+        input_shape=(10,),
+        num_classes=2)
+    y_train = keras.utils.to_categorical(y_train, 2)
+    input_name = keras_model.input_names[0]
+    output_name = keras_model.output_names[0]
+    train_input_fn = numpy_io.numpy_input_fn(
+        x=randomize_io_type(x_train, input_name),
+        y=randomize_io_type(y_train, output_name),
+        shuffle=False,
+        num_epochs=None,
+        batch_size=16)
     with self.assertRaisesRegexp(ValueError, 'relu6'):
       with self.test_session():
-        keras_lib.model_to_estimator(
+        est = keras_lib.model_to_estimator(
             keras_model=keras_model,
             model_dir=tempfile.mkdtemp(dir=self._base_dir))
+        est.train(input_fn=train_input_fn, steps=1)
 
     with self.test_session():
-      keras_lib.model_to_estimator(
+      est = keras_lib.model_to_estimator(
           keras_model=keras_model,
           model_dir=tempfile.mkdtemp(dir=self._base_dir),
           custom_objects=custom_objects)
+      est.train(input_fn=train_input_fn, steps=1)
 
   def test_tf_config(self):
     keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
@@ -530,12 +604,73 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
       gpu_options = config_pb2.GPUOptions(per_process_gpu_memory_fraction=0.3)
       sess_config = config_pb2.ConfigProto(gpu_options=gpu_options)
       self._config._session_config = sess_config
-      keras_lib.model_to_estimator(
-          keras_model=keras_model, config=self._config)
-      self.assertEqual(
-          keras.backend.get_session()
-          ._config.gpu_options.per_process_gpu_memory_fraction,
-          gpu_options.per_process_gpu_memory_fraction)
+      with self.test_session():
+        keras_lib.model_to_estimator(
+            keras_model=keras_model, config=self._config)
+        self.assertEqual(
+            keras.backend.get_session()
+            ._config.gpu_options.per_process_gpu_memory_fraction,
+            gpu_options.per_process_gpu_memory_fraction)
+
+  def test_with_empty_config(self):
+    keras_model, _, _, _, _ = get_resource_for_simple_model(
+        model_type='sequential', is_evaluate=True)
+    keras_model.compile(
+        loss='categorical_crossentropy',
+        optimizer='rmsprop',
+        metrics=['mse', keras.metrics.categorical_accuracy])
+
+    with self.test_session():
+      est_keras = keras_lib.model_to_estimator(
+          keras_model=keras_model, model_dir=self._base_dir,
+          config=run_config_lib.RunConfig())
+      self.assertEqual(run_config_lib.get_default_session_config(),
+                       est_keras._session_config)
+      self.assertEqual(est_keras._session_config,
+                       est_keras._config.session_config)
+      self.assertEqual(self._base_dir, est_keras._config.model_dir)
+      self.assertEqual(self._base_dir, est_keras._model_dir)
+
+    with self.test_session():
+      est_keras = keras_lib.model_to_estimator(
+          keras_model=keras_model, model_dir=self._base_dir,
+          config=None)
+      self.assertEqual(run_config_lib.get_default_session_config(),
+                       est_keras._session_config)
+      self.assertEqual(est_keras._session_config,
+                       est_keras._config.session_config)
+      self.assertEqual(self._base_dir, est_keras._config.model_dir)
+      self.assertEqual(self._base_dir, est_keras._model_dir)
+
+  def test_with_empty_config_and_empty_model_dir(self):
+    keras_model, _, _, _, _ = get_resource_for_simple_model(
+        model_type='sequential', is_evaluate=True)
+    keras_model.compile(
+        loss='categorical_crossentropy',
+        optimizer='rmsprop',
+        metrics=['mse', keras.metrics.categorical_accuracy])
+
+    with self.test_session():
+      with test.mock.patch.object(tempfile, 'mkdtemp', return_value=_TMP_DIR):
+        est_keras = keras_lib.model_to_estimator(
+            keras_model=keras_model,
+            config=run_config_lib.RunConfig())
+        self.assertEqual(est_keras._model_dir, _TMP_DIR)
+
+  def test_with_conflicting_model_dir_and_config(self):
+    keras_model, _, _, _, _ = get_resource_for_simple_model(
+        model_type='sequential', is_evaluate=True)
+    keras_model.compile(
+        loss='categorical_crossentropy',
+        optimizer='rmsprop',
+        metrics=['mse', keras.metrics.categorical_accuracy])
+
+    with self.test_session():
+      with self.assertRaisesRegexp(ValueError, '`model_dir` are set both in '
+                                   'constructor and `RunConfig`'):
+        keras_lib.model_to_estimator(
+            keras_model=keras_model, model_dir=self._base_dir,
+            config=run_config_lib.RunConfig(model_dir=_TMP_DIR))
 
   def test_pretrained_weights(self):
     keras_model, (_, _), (_, _), _, _ = get_resource_for_simple_model()
diff --git a/tensorflow/python/estimator/model_fn.py b/tensorflow/python/estimator/model_fn.py
index a9fd8f8e1a4259fece1a5996343970900c853ce0..9db9ccd01d3fe996c2b047d65cc2f35747b005e1 100644
--- a/tensorflow/python/estimator/model_fn.py
+++ b/tensorflow/python/estimator/model_fn.py
@@ -380,15 +380,12 @@ def _maybe_add_default_serving_output(export_outputs):
   return export_outputs
 
 
-class _TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
-    'mode',
-    'predictions',
-    'loss',
-    'train_op',
-    'eval_metrics',
-    'export_outputs',
-    'scaffold_fn',
-    'host_call'])):
+class _TPUEstimatorSpec(
+    collections.namedtuple('TPUEstimatorSpec', [
+        'mode', 'predictions', 'loss', 'train_op', 'eval_metrics',
+        'export_outputs', 'scaffold_fn', 'host_call', 'training_hooks',
+        'evaluation_hooks', 'prediction_hooks'
+    ])):
   """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
 
   This is a simplified implementation of `tf.contrib.tpu.EstimatorSpec`. See
@@ -404,17 +401,24 @@ class _TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
               eval_metrics=None,
               export_outputs=None,
               scaffold_fn=None,
-              host_call=None):
+              host_call=None,
+              training_hooks=None,
+              evaluation_hooks=None,
+              prediction_hooks=None):
     """Creates a `_TPUEstimatorSpec` instance."""
-    return super(_TPUEstimatorSpec, cls).__new__(cls,
-                                                 mode=mode,
-                                                 predictions=predictions,
-                                                 loss=loss,
-                                                 train_op=train_op,
-                                                 eval_metrics=eval_metrics,
-                                                 export_outputs=export_outputs,
-                                                 scaffold_fn=scaffold_fn,
-                                                 host_call=host_call)
+    return super(_TPUEstimatorSpec, cls).__new__(
+        cls,
+        mode=mode,
+        predictions=predictions,
+        loss=loss,
+        train_op=train_op,
+        eval_metrics=eval_metrics,
+        export_outputs=export_outputs,
+        scaffold_fn=scaffold_fn,
+        host_call=host_call,
+        training_hooks=training_hooks,
+        evaluation_hooks=evaluation_hooks,
+        prediction_hooks=prediction_hooks)
 
   def as_estimator_spec(self):
     """Creates an equivalent `EstimatorSpec` used by CPU train/eval."""
@@ -423,12 +427,16 @@ class _TPUEstimatorSpec(collections.namedtuple('TPUEstimatorSpec', [
     else:
       metric_fn, tensors = self.eval_metrics
       eval_metric_ops = metric_fn(**tensors)
-    return EstimatorSpec(mode=self.mode,
-                         predictions=self.predictions,
-                         loss=self.loss,
-                         train_op=self.train_op,
-                         eval_metric_ops=eval_metric_ops,
-                         export_outputs=self.export_outputs)
+    return EstimatorSpec(
+        mode=self.mode,
+        predictions=self.predictions,
+        loss=self.loss,
+        train_op=self.train_op,
+        eval_metric_ops=eval_metric_ops,
+        export_outputs=self.export_outputs,
+        training_hooks=self.training_hooks,
+        evaluation_hooks=self.evaluation_hooks,
+        prediction_hooks=self.prediction_hooks)
 
 
 def _check_is_tensor_or_operation(x, name):
diff --git a/tensorflow/python/framework/error_interpolation.py b/tensorflow/python/framework/error_interpolation.py
index a79073b748e1f3c39faa71b7656341b302d4d688..7719d0301987e6e0f0d98e52cf4a5332e523f63e 100644
--- a/tensorflow/python/framework/error_interpolation.py
+++ b/tensorflow/python/framework/error_interpolation.py
@@ -87,6 +87,53 @@ def _parse_message(message):
   return seps, tags
 
 
+def _compute_device_summary_from_list(device_assignment_list, prefix=""):
+  """Return a summary of an op's device function stack.
+
+  Args:
+    device_assignment_list: The op._device_assignments list.
+    prefix:  An optional string prefix used before each line of the multi-
+        line string returned by this function.
+
+  Returns:
+    A multi-line string similar to:
+        Device assignments active during op creation:
+          with tf.device(/cpu:0): <test_1.py:27>
+          with tf.device(some_func<foo.py, 123>): <test_2.py:38>
+    The first line will have no padding to its left by default.  Subsequent
+    lines will have two spaces of left-padding.  Use the prefix argument
+    to increase indentation.
+  """
+  if not device_assignment_list:
+    message = "No device assignments were active during op creation."
+    return prefix + message
+
+  str_list = []
+  str_list.append("%sDevice assignments active during op creation:" % prefix)
+
+  for traceable_obj in device_assignment_list:
+    location_summary = "<{file}:{line}>".format(file=traceable_obj.filename,
+                                                line=traceable_obj.lineno)
+    subs = {
+        "prefix": prefix,
+        "indent": "  ",
+        "dev_name": traceable_obj.obj,
+        "loc": location_summary,
+    }
+    str_list.append(
+        "{prefix}{indent}with tf.device({dev_name}): {loc}".format(**subs))
+
+  return "\n".join(str_list)
+
+
+def _compute_device_assignment_summary_from_op(op, prefix=""):
+  if not op:
+    return ""
+  # pylint: disable=protected-access
+  return _compute_device_summary_from_list(op._device_assignments, prefix)
+  # pylint: enable=protected-access
+
+
 def _compute_colocation_summary_from_dict(colocation_dict, prefix=""):
   """Return a summary of an op's colocation stack.
 
@@ -203,6 +250,7 @@ def _compute_field_dict(op):
       "file": default_value,
       "line": default_value,
       "colocations": default_value,
+      "devices": default_value,
   }
   frame = _get_defining_frame_from_op(op)
   if frame:
@@ -211,6 +259,9 @@ def _compute_field_dict(op):
   colocation_summary = _compute_colocation_summary_from_op(op)
   if colocation_summary:
     field_dict["colocations"] = colocation_summary
+  device_summary = _compute_device_assignment_summary_from_op(op)
+  if device_summary:
+    field_dict["devices"] = device_summary
 
   return field_dict
 
@@ -233,6 +284,8 @@ def interpolate(error_message, graph):
 
   node_name_to_substitution_dict = {}
   for name in [t.name for t in tags]:
+    if name in node_name_to_substitution_dict:
+      continue
     try:
       op = graph.get_operation_by_name(name)
     except KeyError:
diff --git a/tensorflow/python/framework/error_interpolation_test.py b/tensorflow/python/framework/error_interpolation_test.py
index 1e5cb738540f6cbe5f108b1f8e403dcb0ee3f458..fbf182879b17f4008712f861cfbf013b45c2380b 100644
--- a/tensorflow/python/framework/error_interpolation_test.py
+++ b/tensorflow/python/framework/error_interpolation_test.py
@@ -57,13 +57,32 @@ def _modify_op_stack_with_filenames(op, num_user_frames, user_filename,
   op._traceback = stack
 
 
-def assert_node_in_colocation_summary(test_obj, colocation_summary_string,
-                                      name, filename="", lineno=""):
-  lineno = str(lineno)
-  name_phrase = "colocate_with(%s)" % name
-  for term in [name_phrase, filename, lineno]:
-    test_obj.assertIn(term, colocation_summary_string)
-  test_obj.assertNotIn("loc:@", colocation_summary_string)
+class ComputeDeviceSummaryFromOpTest(test.TestCase):
+
+  def testCorrectFormatWithActiveDeviceAssignments(self):
+    assignments = []
+    assignments.append(
+        traceable_stack.TraceableObject("/cpu:0",
+                                        filename="hope.py",
+                                        lineno=24))
+    assignments.append(
+        traceable_stack.TraceableObject("/gpu:2",
+                                        filename="please.py",
+                                        lineno=42))
+
+    summary = error_interpolation._compute_device_summary_from_list(
+        assignments, prefix="  ")
+
+    self.assertIn("tf.device(/cpu:0)", summary)
+    self.assertIn("<hope.py:24>", summary)
+    self.assertIn("tf.device(/gpu:2)", summary)
+    self.assertIn("<please.py:42>", summary)
+
+  def testCorrectFormatWhenNoColocationsWereActive(self):
+    device_assignment_list = []
+    summary = error_interpolation._compute_device_summary_from_list(
+        device_assignment_list, prefix="  ")
+    self.assertIn("No device assignments", summary)
 
 
 class ComputeColocationSummaryFromOpTest(test.TestCase):
@@ -81,15 +100,10 @@ class ComputeColocationSummaryFromOpTest(test.TestCase):
     }
     summary = error_interpolation._compute_colocation_summary_from_dict(
         colocation_dict, prefix="  ")
-    assert_node_in_colocation_summary(self,
-                                      summary,
-                                      name="test_node_1",
-                                      filename="test_1.py",
-                                      lineno=27)
-    assert_node_in_colocation_summary(self, summary,
-                                      name="test_node_2",
-                                      filename="test_2.py",
-                                      lineno=38)
+    self.assertIn("colocate_with(test_node_1)", summary)
+    self.assertIn("<test_1.py:27>", summary)
+    self.assertIn("colocate_with(test_node_2)", summary)
+    self.assertIn("<test_2.py:38>", summary)
 
   def testCorrectFormatWhenNoColocationsWereActive(self):
     colocation_dict = {}
@@ -98,9 +112,10 @@ class ComputeColocationSummaryFromOpTest(test.TestCase):
     self.assertIn("No node-device colocations", summary)
 
 
-class InterpolateTest(test.TestCase):
+class InterpolateFilenamesAndLineNumbersTest(test.TestCase):
 
   def setUp(self):
+    ops.reset_default_graph()
     # Add nodes to the graph for retrieval by name later.
     constant_op.constant(1, name="One")
     constant_op.constant(2, name="Two")
@@ -177,9 +192,57 @@ class InterpolateTest(test.TestCase):
     self.assertRegexpMatches(interpolated_string, expected_regex)
 
 
+class InterpolateDeviceSummaryTest(test.TestCase):
+
+  def _fancy_device_function(self, unused_op):
+    return "/cpu:*"
+
+  def setUp(self):
+    ops.reset_default_graph()
+    self.zero = constant_op.constant([0.0], name="zero")
+    with ops.device("/cpu"):
+      self.one = constant_op.constant([1.0], name="one")
+      with ops.device("/cpu:0"):
+        self.two = constant_op.constant([2.0], name="two")
+    with ops.device(self._fancy_device_function):
+      self.three = constant_op.constant(3.0, name="three")
+
+    self.graph = self.three.graph
+
+  def testNodeZeroHasNoDeviceSummaryInfo(self):
+    message = "^^node:zero:${devices}^^"
+    result = error_interpolation.interpolate(message, self.graph)
+    self.assertIn("No device assignments were active", result)
+
+  def testNodeOneHasExactlyOneInterpolatedDevice(self):
+    message = "^^node:one:${devices}^^"
+    result = error_interpolation.interpolate(message, self.graph)
+    num_devices = result.count("tf.device")
+    self.assertEqual(1, num_devices)
+    self.assertIn("tf.device(/cpu)", result)
+
+  def testNodeTwoHasTwoInterpolatedDevice(self):
+    message = "^^node:two:${devices}^^"
+    result = error_interpolation.interpolate(message, self.graph)
+    num_devices = result.count("tf.device")
+    self.assertEqual(2, num_devices)
+    self.assertIn("tf.device(/cpu)", result)
+    self.assertIn("tf.device(/cpu:0)", result)
+
+  def testNodeThreeHasFancyFunctionDisplayNameForInterpolatedDevice(self):
+    message = "^^node:three:${devices}^^"
+    result = error_interpolation.interpolate(message, self.graph)
+    num_devices = result.count("tf.device")
+    self.assertEqual(1, num_devices)
+    name_re = r"_fancy_device_function<.*error_interpolation_test.py, [0-9]+>"
+    expected_re = r"with tf.device\(.*%s\)" % name_re
+    self.assertRegexpMatches(result, expected_re)
+
+
 class InterpolateColocationSummaryTest(test.TestCase):
 
   def setUp(self):
+    ops.reset_default_graph()
     # Add nodes to the graph for retrieval by name later.
     node_one = constant_op.constant(1, name="One")
     node_two = constant_op.constant(2, name="Two")
@@ -203,12 +266,12 @@ class InterpolateColocationSummaryTest(test.TestCase):
   def testNodeThreeHasColocationInterpolation(self):
     message = "^^node:Three_with_one:${colocations}^^"
     result = error_interpolation.interpolate(message, self.graph)
-    assert_node_in_colocation_summary(self, result, name="One")
+    self.assertIn("colocate_with(One)", result)
 
   def testNodeFourHasColocationInterpolationForNodeThreeOnly(self):
     message = "^^node:Four_with_three:${colocations}^^"
     result = error_interpolation.interpolate(message, self.graph)
-    assert_node_in_colocation_summary(self, result, name="Three_with_one")
+    self.assertIn("colocate_with(Three_with_one)", result)
     self.assertNotIn(
         "One", result,
         "Node One should not appear in Four_with_three's summary:\n%s"
@@ -217,8 +280,8 @@ class InterpolateColocationSummaryTest(test.TestCase):
   def testNodeFiveHasColocationInterpolationForNodeOneAndTwo(self):
     message = "^^node:Five_with_one_with_two:${colocations}^^"
     result = error_interpolation.interpolate(message, self.graph)
-    assert_node_in_colocation_summary(self, result, name="One")
-    assert_node_in_colocation_summary(self, result, name="Two")
+    self.assertIn("colocate_with(One)", result)
+    self.assertIn("colocate_with(Two)", result)
 
   def testColocationInterpolationForNodeLackingColocation(self):
     message = "^^node:One:${colocations}^^"
diff --git a/tensorflow/python/framework/fast_tensor_util.pyx b/tensorflow/python/framework/fast_tensor_util.pyx
index 17d112a1ece9ae3d121b894d9b246d24b46d84e2..2e3e15f53a919bac669b56e4a8f27c1808da345a 100644
--- a/tensorflow/python/framework/fast_tensor_util.pyx
+++ b/tensorflow/python/framework/fast_tensor_util.pyx
@@ -6,6 +6,13 @@ cimport numpy as np
 
 from tensorflow.python.util import compat
 
+def AppendBFloat16ArrayToTensorProto(
+    tensor_proto, np.ndarray[np.uint16_t, ndim=1] nparray):
+  cdef long i, n
+  n = nparray.size
+  for i in range(n):
+    tensor_proto.half_val.append(nparray[i])
+
 
 def AppendFloat16ArrayToTensorProto(
     # For numpy, npy_half is a typedef for npy_uint16,
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 6525607faea62a461ee38fa0393ac29b809bb9b6..c76743d2c629b8f3fb6961602a1575209967339a 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -38,8 +38,8 @@ from tensorflow.python.ops import cond_v2_impl
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.util import compat
+from tensorflow.python.util import function_utils
 from tensorflow.python.util import tf_contextlib
-from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
 # This is to avoid a circular dependency with cond_v2_impl.
@@ -255,9 +255,12 @@ class _DefinedFunction(object):
     # Constructed only when C API is enabled, lazily
     self._c_func = None
     self._sub_functions = dict()  # Constructed with _definition or _c_func
-    device_stack = ops.get_default_graph()._device_function_stack  # pylint: disable=protected-access
+    # pylint: disable=protected-access
+    device_funcs = ops.get_default_graph()._device_functions_outer_to_inner
+    # pylint: enable=protected-access
+
     # Get the innermost device if possbile.
-    self._caller_device = device_stack[-1] if device_stack else None
+    self._caller_device = device_funcs[-1] if device_funcs else None
 
     # Cached OpDef for this function. When C API is enabled, this is
     # the only part of FunctionDef that we cache in Python. When C API
@@ -354,7 +357,7 @@ class _DefinedFunction(object):
     if self._func_name:
       base_func_name = self._func_name
     else:
-      base_func_name = _get_func_name(self._func)
+      base_func_name = function_utils.get_func_name(self._func)
       if self._grad_func:
         base_func_name += ("_%s" % self._grad_func.name)
     kwargs_attr = _parse_kwargs_as_attrs(base_func_name, **self._extra_kwargs)
@@ -841,7 +844,7 @@ def func_graph_from_py_func(func, arg_names, arg_types, name=None,
     ValueError: if func returns None.
   """
   if not name:
-    name = _get_func_name(func)
+    name = function_utils.get_func_name(func)
   func_graph = _FuncGraph(name, capture_by_value)
 
   with func_graph.as_default(), ops.device(device):
@@ -1139,19 +1142,6 @@ def _parse_kwargs_as_attrs(func_name, **kwargs):
   return attrs
 
 
-def _get_func_name(func):
-  _, func = tf_decorator.unwrap(func)
-  if callable(func):
-    if tf_inspect.isfunction(func):
-      return func.__name__
-    elif tf_inspect.ismethod(func):
-      return "%s.%s" % (func.__self__.__name__, func.__name__)
-    else:  # Probably a class instance with __call__
-      return type(func)
-  else:
-    raise ValueError("Argument must be callable")
-
-
 def get_extra_vars():
   """Returns the captured variables by the function.
 
diff --git a/tensorflow/python/framework/function_def_to_graph.py b/tensorflow/python/framework/function_def_to_graph.py
index 46c9c4c14adc7d4adeb11b45210cb296acb55086..1b09506662d26a4d0be8e7d77f7f0fea61d6835b 100644
--- a/tensorflow/python/framework/function_def_to_graph.py
+++ b/tensorflow/python/framework/function_def_to_graph.py
@@ -25,7 +25,7 @@ from tensorflow.core.framework import types_pb2
 from tensorflow.core.framework import versions_pb2
 from tensorflow.python.framework import function
 from tensorflow.python.framework import importer
-from tensorflow.python.framework import op_def_registry
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import cond_v2_impl
 
@@ -114,6 +114,10 @@ def function_def_to_graph_def(fdef, input_shapes=None):
           producer=versions.GRAPH_DEF_VERSION,
           min_consumer=versions.GRAPH_DEF_VERSION_MIN_CONSUMER))
 
+  # Copy *all* functions from outer graph to `graph_def` so that both direct
+  # and indirect references are safely handled.
+  ops.get_default_graph()._copy_functions_to_graph_def(graph_def, 0)  # pylint: disable=protected-access
+
   if input_shapes and len(input_shapes) != len(fdef.signature.input_arg):
     raise ValueError("Length of input_shapes must match the number of " +
                      "input_args. len(input_shapes): {} len(input_arg): {}".
@@ -142,24 +146,18 @@ def function_def_to_graph_def(fdef, input_shapes=None):
     nested_to_flat_tensor_name[arg_def.name] = "{}:0".format(arg_def.name)
 
   for node_def in fdef.node_def:
-    op_def = op_def_registry.get_registered_ops().get(node_def.op)
-    if not op_def:
-      # TODO(b/80470245): Support functions which refer other functions.
-      raise NotImplementedError(
-          "No op registered for {},".format(node_def.op) +
-          " it may be a function. function_def_to_graph_def " +
-          "currently does not support converting functions with " +
-          "references to other graph functions.")
+    op_def = ops.get_default_graph()._get_op_def(node_def.op)  # pylint: disable=protected-access
 
     for attr in op_def.attr:
-      if attr.type in ("func", "list(func)"):
-        # TODO(b/80470245): Support functions which refer other functions.
-        raise NotImplementedError("Unsupported attr {} ".format(attr.name) +
-                                  " with type {}".format(attr.type) +
-                                  " in op {}. ".format(op_def.name) +
-                                  "function_def_to_graph_def currently does " +
-                                  "not support converting functions with " +
-                                  "references to other graph functions.")
+      if attr.type == "func":
+        fname = node_def.attr[attr.name].func.name
+        if not ops.get_default_graph()._is_function(fname):  # pylint: disable=protected-access
+          raise ValueError("%s function not found." % fname)
+      elif attr.type == "list(func)":
+        for fn in node_def.attr[attr.name].list.func:
+          fname = fn.name
+          if not ops.get_default_graph()._is_function(fname):  # pylint: disable=protected-access
+            raise ValueError("%s function not found." % fname)
 
     # Iterate over output_args in op_def to build the map.
     # Index of the output tensor in the flattened list of *all* output
diff --git a/tensorflow/python/framework/function_def_to_graph_test.py b/tensorflow/python/framework/function_def_to_graph_test.py
index 0f4e6ef54fb02cc6ba52c9de2ccabea982fd2323..cd2a16ed5a83f8ffcd8288967815b7ad2761c52f 100644
--- a/tensorflow/python/framework/function_def_to_graph_test.py
+++ b/tensorflow/python/framework/function_def_to_graph_test.py
@@ -18,7 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
 from tensorflow.python.framework import function_def_to_graph
 from tensorflow.python.framework import graph_to_function_def
 from tensorflow.python.framework import ops
@@ -79,7 +81,6 @@ class FunctionDefToGraphTest(test.TestCase):
 
     g = function_def_to_graph.function_def_to_graph(
         fdef, input_shapes=[None, tensor_shape.matrix(5, 7)])
-    print(g.as_graph_def())
     self.assertIsNone(g.inputs[0].shape.dims)
     self.assertSequenceEqual(g.inputs[1].shape.dims, [5, 7])
     self.assertSequenceEqual(g.outputs[0].shape.dims, [5, 7])
@@ -179,6 +180,37 @@ class FunctionDefToGraphDefTest(test.TestCase):
     self.assertEqual(g.node[0].attr["shape"].shape.unknown_rank, False)
     self.assertFalse("shape" in g.node[2].attr)
 
+  def testFunctionCallsFromFunction(self):
+    x = constant_op.constant(5.0)
+    y = constant_op.constant(10.0)
+
+    @function.Defun()
+    def fn():
+
+      @function.Defun()
+      def inner_fn():
+        return x + y
+
+      return inner_fn()
+
+    # Instantiate the function in this graph so that
+    # `function_def_to_graph` can find it.
+    fn()
+
+    def fn2():
+      return 2 * fn()
+
+    fdef = function._DefinedFunction(fn2, [], []).definition
+    func_graph = function_def_to_graph.function_def_to_graph(fdef)
+    with func_graph.as_default():
+      x_ph, y_ph = func_graph.inputs
+      with self.test_session(graph=func_graph) as sess:
+        self.assertEqual(
+            sess.run(func_graph.outputs[0], feed_dict={
+                x_ph: 5.0,
+                y_ph: 10.0
+            }), 30.0)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index 699d2b70d176db7718a6e480f9f7b08a65ae6a8e..687bfebd4306596233df8db6a639e65df2f85980 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -205,7 +205,7 @@ def _PopulateTFImportGraphDefOptions(options, prefix, input_map,
   for input_src, input_dst in input_map.items():
     input_src = compat.as_str(input_src)
     if input_src.startswith('^'):
-      src_name = compat.as_bytes(input_src[1:])
+      src_name = compat.as_str(input_src[1:])
       dst_op = input_dst._as_tf_output().oper  # pylint: disable=protected-access
       c_api.TF_ImportGraphDefOptionsRemapControlDependency(
           options, src_name, dst_op)
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index c5a54470d27b5949fd642b057feda7f3f1a4347f..7182c28666e8ab81962ed12f6490bc67144aa75d 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_ops  # pylint: disable=unused-import
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
@@ -419,6 +420,46 @@ class ImportGraphDefTest(test.TestCase):
       with self.test_session() as sess:
         self.assertEqual(sess.run(imported_r), 10)
 
+  def testImportWhileLoopInCond(self):
+    # Produce GraphDef containing while loop.
+    graph = ops.Graph()
+    with graph.as_default():
+      r = control_flow_ops.while_loop(lambda i: i < 10, lambda i: i + 1, [0])
+    graph_def = graph.as_graph_def()
+
+    # Import the GraphDef inside a cond and make sure it runs.
+    with ops.Graph().as_default():
+
+      def ImportFn():
+        return importer.import_graph_def(graph_def, return_elements=[r.name])[0]
+
+      pred = array_ops.placeholder(dtypes.bool)
+      out = control_flow_ops.cond(pred, ImportFn,
+                                  lambda: constant_op.constant(1))
+      with self.test_session() as sess:
+        self.assertEqual(sess.run(out, {pred: True}), 10)
+        self.assertEqual(sess.run(out, {pred: False}), 1)
+
+  def testImportWhileLoopInWhileLoop(self):
+    self.skipTest("b/111757448")
+    # Produce GraphDef containing while loop.
+    graph = ops.Graph()
+    with graph.as_default():
+      r = control_flow_ops.while_loop(lambda i: i < 10, lambda i: i + 1, [0])
+    graph_def = graph.as_graph_def()
+
+    # Import the GraphDef inside another loop and make sure it runs.
+    with ops.Graph().as_default():
+
+      def ImportFn(_):
+        return importer.import_graph_def(graph_def, return_elements=[r.name])[0]
+
+      out = control_flow_ops.while_loop(
+          lambda i: i < 2, ImportFn, [0],
+          shape_invariants=[tensor_shape.TensorShape(None)])
+      with self.test_session() as sess:
+        self.assertEqual(sess.run(out), 10)
+
   def testTypeMismatchInGraphDef(self):
     # TODO(skyewm): improve error message
     error_msg = ("Input 0 of node import/B was passed int32 from import/A:0 "
diff --git a/tensorflow/python/framework/kernels.py b/tensorflow/python/framework/kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7641f3442e4c5a6508a3463c700ade97ce202a9
--- /dev/null
+++ b/tensorflow/python/framework/kernels.py
@@ -0,0 +1,46 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions for querying registered kernels."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import kernel_def_pb2
+from tensorflow.python import pywrap_tensorflow as c_api
+from tensorflow.python.util import compat
+
+
+def get_all_registered_kernels():
+  """Returns a KernelList proto of all registered kernels.
+  """
+  buf = c_api.TF_GetAllRegisteredKernels()
+  data = c_api.TF_GetBuffer(buf)
+  kernel_list = kernel_def_pb2.KernelList()
+  kernel_list.ParseFromString(compat.as_bytes(data))
+  return kernel_list
+
+
+def get_registered_kernels_for_op(name):
+  """Returns a KernelList proto of registered kernels for a given op.
+
+  Args:
+    name: A string representing the name of the op whose kernels to retrieve.
+  """
+  buf = c_api.TF_GetRegisteredKernelsForOp(name)
+  data = c_api.TF_GetBuffer(buf)
+  kernel_list = kernel_def_pb2.KernelList()
+  kernel_list.ParseFromString(compat.as_bytes(data))
+  return kernel_list
diff --git a/tensorflow/python/framework/kernels_test.py b/tensorflow/python/framework/kernels_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c53500be73a05b2d9b379fd61e899a091b7db9b1
--- /dev/null
+++ b/tensorflow/python/framework/kernels_test.py
@@ -0,0 +1,41 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for querying registered kernels."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import kernels
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import googletest
+
+
+class GetAllRegisteredKernelsTest(test_util.TensorFlowTestCase):
+
+  def testFindsAtLeastOneKernel(self):
+    kernel_list = kernels.get_all_registered_kernels()
+    self.assertGreater(len(kernel_list.kernel), 0)
+
+
+class GetRegisteredKernelsForOp(test_util.TensorFlowTestCase):
+
+  def testFindsAtLeastOneKernel(self):
+    kernel_list = kernels.get_registered_kernels_for_op("KernelLabel")
+    self.assertGreater(len(kernel_list.kernel), 0)
+    self.assertEqual(kernel_list.kernel[0].op, "KernelLabel")
+
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/framework/meta_graph.py b/tensorflow/python/framework/meta_graph.py
index 923e76fc9c8f231cc9a43bc05280dac1ea458d3c..33631282bd03a15daddb334e6f40e6b52f84c750 100644
--- a/tensorflow/python/framework/meta_graph.py
+++ b/tensorflow/python/framework/meta_graph.py
@@ -696,6 +696,67 @@ def import_scoped_meta_graph(meta_graph_or_file,
   Raises:
     ValueError: If the graph_def contains unbound inputs.
   """
+  return import_scoped_meta_graph_with_return_elements(
+      meta_graph_or_file, clear_devices, graph, import_scope, input_map,
+      unbound_inputs_col_name, restore_collections_predicate)[0]
+
+
+def import_scoped_meta_graph_with_return_elements(
+    meta_graph_or_file,
+    clear_devices=False,
+    graph=None,
+    import_scope=None,
+    input_map=None,
+    unbound_inputs_col_name="unbound_inputs",
+    restore_collections_predicate=(lambda key: True),
+    return_elements=None):
+  """Imports graph from `MetaGraphDef` and returns vars and return elements.
+
+  This function takes a `MetaGraphDef` protocol buffer as input. If
+  the argument is a file containing a `MetaGraphDef` protocol buffer ,
+  it constructs a protocol buffer from the file content. The function
+  then adds all the nodes from the `graph_def` field to the
+  current graph, recreates the desired collections, and returns a dictionary of
+  all the Variables imported into the name scope.
+
+  In combination with `export_scoped_meta_graph()`, this function can be used to
+
+  * Serialize a graph along with other Python objects such as `QueueRunner`,
+    `Variable` into a `MetaGraphDef`.
+
+  * Restart training from a saved graph and checkpoints.
+
+  * Run inference from a saved graph and checkpoints.
+
+  Args:
+    meta_graph_or_file: `MetaGraphDef` protocol buffer or filename (including
+      the path) containing a `MetaGraphDef`.
+    clear_devices: Boolean which controls whether to clear device information
+      from graph_def. Default false.
+    graph: The `Graph` to import into. If `None`, use the default graph.
+    import_scope: Optional `string`. Name scope into which to import the
+      subgraph. If `None`, the graph is imported to the root name scope.
+    input_map: A dictionary mapping input names (as strings) in `graph_def` to
+      `Tensor` objects. The values of the named input tensors in the imported
+      graph will be re-mapped to the respective `Tensor` values.
+    unbound_inputs_col_name: Collection name for looking up unbound inputs.
+    restore_collections_predicate: a predicate on collection names. A collection
+      named c (i.e whose key is c) will be restored iff
+      1) `restore_collections_predicate(c)` is True, and
+      2) `c != unbound_inputs_col_name`.
+    return_elements:  A list of strings containing operation names in the
+      `MetaGraphDef` that will be returned as `Operation` objects; and/or
+      tensor names in `MetaGraphDef` that will be returned as `Tensor` objects.
+
+  Returns:
+    A tuple of (
+      dictionary of all the `Variables` imported into the name scope,
+      list of `Operation` or `Tensor` objects from the `return_elements` list).
+
+  Raises:
+    ValueError: If the graph_def contains unbound inputs.
+
+  """
   if context.executing_eagerly():
     raise ValueError("Exporting/importing meta graphs is not supported when "
                      "eager execution is enabled.")
@@ -737,11 +798,12 @@ def import_scoped_meta_graph(meta_graph_or_file,
     scope_to_prepend_to_names = graph.unique_name(
         import_scope or "", mark_as_used=False)
 
-    importer.import_graph_def(
+    imported_return_elements = importer.import_graph_def(
         input_graph_def,
         name=(import_scope or scope_to_prepend_to_names),
         input_map=input_map,
-        producer_op_list=producer_op_list)
+        producer_op_list=producer_op_list,
+        return_elements=return_elements)
 
     # Restores all the other collections.
     variable_objects = {}
@@ -806,7 +868,7 @@ def import_scoped_meta_graph(meta_graph_or_file,
     for v in variables:
       var_list[ops.strip_name_scope(v.name, scope_to_prepend_to_names)] = v
 
-  return var_list
+  return var_list, imported_return_elements
 
 
 def export_scoped_meta_graph(filename=None,
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index b813cd6c0683b6a30094a04f39f37e832be6b0ef..c25e29b0f46d2050098aedb3f82e0c1029f435a7 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -50,14 +50,15 @@ from tensorflow.python.framework import registry
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import traceable_stack
 from tensorflow.python.framework import versions
-from tensorflow.python.util import tf_stack
 from tensorflow.python.ops import control_flow_util
 from tensorflow.python.platform import app
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import decorator_utils
+from tensorflow.python.util import function_utils
 from tensorflow.python.util import lock_util
 from tensorflow.python.util import tf_contextlib
+from tensorflow.python.util import tf_stack
 from tensorflow.python.util.deprecation import deprecated_args
 from tensorflow.python.util.tf_export import tf_export
 
@@ -73,6 +74,31 @@ def tensor_id(tensor):
   return tensor._id  # pylint: disable=protected-access
 
 
+class _UserDeviceSpec(object):
+  """Store user-specified device and provide computation of merged device."""
+
+  def __init__(self, device_name_or_function):
+    self._device_name_or_function = device_name_or_function
+
+    self.display_name = str(self._device_name_or_function)
+    if callable(self._device_name_or_function):
+      dev_func = self._device_name_or_function
+      func_name = function_utils.get_func_name(dev_func)
+      func_code = function_utils.get_func_code(dev_func)
+      if func_code:
+        fname = func_code.co_filename
+        lineno = func_code.co_firstlineno
+      else:
+        fname = "unknown"
+        lineno = -1
+      self.display_name = "%s<%s, %d>" % (func_name, fname, lineno)
+
+    self.function = self._device_name_or_function
+    if not (self._device_name_or_function is None or
+            callable(self._device_name_or_function)):
+      self.function = pydev.merge_device(self._device_name_or_function)
+
+
 class _NullContextmanager(object):
 
   def __enter__(self):
@@ -709,7 +735,7 @@ class _EagerTensorBase(Tensor):
       raise ValueError("Resource handles are not convertible to numpy.")
     return self._cpu_nograd()._numpy()  # pylint: disable=protected-access
 
-  # __int__ and  __float__ may copy the tensor to CPU and
+  # __int__, __float__ and __index__ may copy the tensor to CPU and
   # only work for scalars; values are cast as per numpy.
   def __int__(self):
     return int(self.numpy())
@@ -717,6 +743,9 @@ class _EagerTensorBase(Tensor):
   def __float__(self):
     return float(self.numpy())
 
+  def __index__(self):
+    return int(self.numpy())
+
   def __array__(self, dtype=None):
     return np.array(self.numpy(), dtype=dtype)
 
@@ -1716,7 +1745,12 @@ class Operation(object):
     self._id_value = self._graph._next_id()
     self._original_op = original_op
     self._traceback = tf_stack.extract_stack()
-    # List of traceable_stack.TraceableObjects for colocation context managers.
+
+    # List of _UserDevSpecs holding code location of device context manager
+    # invocations and the users original argument to them.
+    self._device_code_locations = None
+    # Dict mapping op name to file and line information for op colocation
+    # context managers.
     self._colocation_code_locations = None
     self._control_flow_context = self.graph._get_control_flow_context()
     # pylint: enable=protected-access
@@ -1857,6 +1891,37 @@ class Operation(object):
     """
     return c_api.TF_OperationDevice(self._c_op)
 
+  @property
+  def _device_assignments(self):
+    """Code locations for device context managers active at op creation.
+
+    This property will return a list of traceable_stack.TraceableObject
+    instances where .obj is a string representing the assigned device
+    (or information about the function that would be applied to this op
+    to compute the desired device) and the filename and lineno members
+    record the location of the relevant device context manager.
+
+    For example, suppose file_a contained these lines:
+
+      file_a.py:
+        15: with tf.device('/gpu:0'):
+        16:   node_b = tf.constant(4, name='NODE_B')
+
+    Then a TraceableObject t_obj representing the device context manager
+    would have these member values:
+
+      t_obj.obj -> '/gpu:0'
+      t_obj.filename = 'file_a.py'
+      t_obj.lineno = 15
+
+    and node_b.op._device_assignments would return the list [t_obj].
+
+    Returns:
+      [str: traceable_stack.TraceableObject, ...] as per this method's
+      description, above.
+    """
+    return self._device_code_locations or []
+
   @property
   def _colocation_dict(self):
     """Code locations for colocation context managers active at op creation.
@@ -1878,11 +1943,10 @@ class Operation(object):
     would have these member values:
 
       t_obj.obj -> None
-      t_obj.name = 'NODE_A'
       t_obj.filename = 'file_a.py'
       t_obj.lineno = 15
 
-    and node_b.op._colocation_code_locations would return the dictionary
+    and node_b.op._colocation_dict would return the dictionary
 
       { 'NODE_A': t_obj }
 
@@ -2732,7 +2796,7 @@ class Graph(object):
     # Functions that will be applied to choose a device if none is specified.
     # After switch_to_thread_local(), self._thread_local._device_function_stack
     # is used instead.
-    self._graph_device_function_stack = []
+    self._graph_device_function_stack = traceable_stack.TraceableStack()
     # Default original_op applied to new ops.
     self._default_original_op = None
     # Current control flow context. It could be either CondContext or
@@ -3776,8 +3840,8 @@ class Graph(object):
       Nothing.
     """
     old_original_op = self._default_original_op
+    self._default_original_op = op
     try:
-      self._default_original_op = op
       yield
     finally:
       self._default_original_op = old_original_op
@@ -3894,15 +3958,15 @@ class Graph(object):
         # op name regex, which constrains the initial character.
         if not _VALID_OP_NAME_REGEX.match(name):
           raise ValueError("'%s' is not a valid scope name" % name)
+    old_stack = self._name_stack
+    if not name:  # Both for name=None and name="" we re-set to empty scope.
+      new_stack = None
+    elif name[-1] == "/":
+      new_stack = _name_from_scope_name(name)
+    else:
+      new_stack = self.unique_name(name)
+    self._name_stack = new_stack
     try:
-      old_stack = self._name_stack
-      if not name:  # Both for name=None and name="" we re-set to empty scope.
-        new_stack = None
-      elif name[-1] == "/":
-        new_stack = _name_from_scope_name(name)
-      else:
-        new_stack = self.unique_name(name)
-      self._name_stack = new_stack
       yield "" if new_stack is None else new_stack + "/"
     finally:
       self._name_stack = old_stack
@@ -3983,8 +4047,8 @@ class Graph(object):
                                   ignore_existing=False):
     with self.colocate_with(op, ignore_existing):
       if gradient_uid is not None and self._control_flow_context is not None:
+        self._control_flow_context.EnterGradientColocation(op, gradient_uid)
         try:
-          self._control_flow_context.EnterGradientColocation(op, gradient_uid)
           yield
         finally:
           self._control_flow_context.ExitGradientColocation(op, gradient_uid)
@@ -4026,7 +4090,6 @@ class Graph(object):
     Yields:
       A context manager that specifies the op with which to colocate
       newly created ops.
-
     """
     if op is None and not ignore_existing:
       raise ValueError("Trying to reset colocation (op is None) but "
@@ -4044,7 +4107,7 @@ class Graph(object):
     # In the future, a caller may specify that device_functions win
     # over colocation, in which case we can add support.
     device_fn_tmp = self._device_function_stack
-    self._device_function_stack = []
+    self._device_function_stack = traceable_stack.TraceableStack()
 
     if ignore_existing:
       current_stack = self._colocation_stack
@@ -4068,6 +4131,13 @@ class Graph(object):
       if ignore_existing:
         self._colocation_stack = current_stack
 
+  def _add_device_to_stack(self, device_name_or_function, offset=0):
+    """Add device to stack manually, separate from a context manager."""
+    total_offset = 1 + offset
+    spec = _UserDeviceSpec(device_name_or_function)
+    self._device_function_stack.push_obj(spec, offset=total_offset)
+    return spec
+
   @tf_contextlib.contextmanager
   def device(self, device_name_or_function):
     # pylint: disable=line-too-long
@@ -4125,31 +4195,26 @@ class Graph(object):
     Yields:
       A context manager that specifies the default device to use for newly
       created ops.
-
     """
-    # pylint: enable=line-too-long
-    if (device_name_or_function is not None and
-        not callable(device_name_or_function)):
-      device_function = pydev.merge_device(device_name_or_function)
-    else:
-      device_function = device_name_or_function
-
+    self._add_device_to_stack(device_name_or_function, offset=2)
     try:
-      self._device_function_stack.append(device_function)
       yield
     finally:
-      self._device_function_stack.pop()
+      self._device_function_stack.pop_obj()
 
   def _apply_device_functions(self, op):
     """Applies the current device function stack to the given operation."""
-    # Apply any device functions in reverse order, so that the most recently
+    # Apply any device functions in LIFO order, so that the most recently
     # pushed function has the first chance to apply a device to the op.
     # We apply here because the result can depend on the Operation's
     # signature, which is computed in the Operation constructor.
-    for device_function in reversed(self._device_function_stack):
-      if device_function is None:
+    # pylint: disable=protected-access
+    for device_spec in self._device_function_stack.peek_objs():
+      if device_spec.function is None:
         break
-      op._set_device(device_function(op))  # pylint: disable=protected-access
+      op._set_device(device_spec.function(op))
+    op._device_code_locations = self._snapshot_device_function_stack_metadata()
+    # pylint: enable=protected-access
 
   # pylint: disable=g-doc-return-or-yield
   @tf_contextlib.contextmanager
@@ -4198,8 +4263,8 @@ class Graph(object):
         yields the container name.
     """
     original_container = self._container
+    self._container = container_name
     try:
-      self._container = container_name
       yield self._container
     finally:
       self._container = original_container
@@ -4673,17 +4738,45 @@ class Graph(object):
     if self._stack_state_is_thread_local:
       # This may be called from a thread where device_function_stack doesn't yet
       # exist.
+      # pylint: disable=protected-access
       if not hasattr(self._thread_local, "_device_function_stack"):
-        self._thread_local._device_function_stack = (
-            self._graph_device_function_stack[:])
+        stack_copy_for_this_thread = self._graph_device_function_stack.copy()
+        self._thread_local._device_function_stack = stack_copy_for_this_thread
       return self._thread_local._device_function_stack
+      # pylint: enable=protected-access
     else:
       return self._graph_device_function_stack
 
+  @property
+  def _device_functions_outer_to_inner(self):
+    user_device_specs = self._device_function_stack.peek_objs()
+    device_functions = [spec.function for spec in user_device_specs]
+    device_functions_outer_to_inner = list(reversed(device_functions))
+    return device_functions_outer_to_inner
+
+  def _snapshot_device_function_stack_metadata(self):
+    """Return device function stack as a list of TraceableObjects.
+
+    Returns:
+      [traceable_stack.TraceableObject, ...] where each TraceableObject's .obj
+      member is a displayable name for the user's argument to Graph.device, and
+      the filename and lineno members point to the code location where
+      Graph.device was called directly or indirectly by the user.
+    """
+    traceable_objects = self._device_function_stack.peek_traceable_objs()
+    snapshot = []
+    for obj in traceable_objects:
+      obj_copy = obj.copy_metadata()
+      obj_copy.obj = obj.obj.display_name
+      snapshot.append(obj_copy)
+    return snapshot
+
   @_device_function_stack.setter
   def _device_function_stack(self, device_function_stack):
     if self._stack_state_is_thread_local:
+      # pylint: disable=protected-access
       self._thread_local._device_function_stack = device_function_stack
+      # pylint: enable=protected-access
     else:
       self._graph_device_function_stack = device_function_stack
 
@@ -4693,12 +4786,12 @@ class Graph(object):
     if self._stack_state_is_thread_local:
       # This may be called from a thread where colocation_stack doesn't yet
       # exist.
+      # pylint: disable=protected-access
       if not hasattr(self._thread_local, "_colocation_stack"):
         stack_copy_for_this_thread = self._graph_colocation_stack.copy()
-        # pylint: disable=protected-access
         self._thread_local._colocation_stack = stack_copy_for_this_thread
-        # pylint: enable=protected-access
       return self._thread_local._colocation_stack
+      # pylint: enable=protected-access
     else:
       return self._graph_colocation_stack
 
@@ -4710,7 +4803,9 @@ class Graph(object):
   @_colocation_stack.setter
   def _colocation_stack(self, colocation_stack):
     if self._stack_state_is_thread_local:
+      # pylint: disable=protected-access
       self._thread_local._colocation_stack = colocation_stack
+      # pylint: enable=protected-access
     else:
       self._graph_colocation_stack = colocation_stack
 
@@ -4879,8 +4974,8 @@ class _DefaultStack(threading.local):
   @tf_contextlib.contextmanager
   def get_controller(self, default):
     """A context manager for manipulating a default stack."""
+    self.stack.append(default)
     try:
-      self.stack.append(default)
       yield default
     finally:
       # stack may be empty if reset() was called
@@ -5068,13 +5163,15 @@ class _DefaultGraphStack(_DefaultStack):  # pylint: disable=protected-access
 
   @tf_contextlib.contextmanager
   def get_controller(self, default):
+    context.context().context_switches.push(
+        default.building_function, default.as_default)
     try:
-      context.context().context_switches.push(
-          default.building_function, default.as_default)
       with super(_DefaultGraphStack, self).get_controller(
           default) as g, context.graph_mode():
         yield g
     finally:
+      # If an exception is raised here it may be hiding a related exception in
+      # the try-block (just above).
       context.context().context_switches.pop()
 
 
@@ -5110,6 +5207,9 @@ def init_scope():
         `init_scope` will simply install a fresh graph as the default one.
 
     (3) The gradient tape is paused while the scope is active.
+
+  Raises:
+    RuntimeError: if graph state is incompatible with this initialization.
   """
   # pylint: enable=g-doc-return-or-yield,line-too-long
 
@@ -5122,10 +5222,10 @@ def init_scope():
     # the name scope of the current context.
     default_graph = get_default_graph()
     scope = default_graph.get_name_scope()
-    if scope and scope[-1] != '/':
+    if scope and scope[-1] != "/":
       # Names that end with trailing slashes are treated by `name_scope` as
       # absolute.
-      scope = scope + '/'
+      scope = scope + "/"
     inner_device_stack = default_graph._device_function_stack  # pylint: disable=protected-access
 
     outer_context = None
@@ -5170,6 +5270,8 @@ def init_scope():
           outer_graph._device_function_stack = inner_device_stack  # pylint: disable=protected-access
         yield
     finally:
+      # If an exception is raised here it may be hiding a related exception in
+      # try-block (just above).
       if outer_graph is not None:
         outer_graph._device_function_stack = outer_device_stack  # pylint: disable=protected-access
 
@@ -5237,7 +5339,10 @@ def enable_eager_execution(config=None,
      to this function.
   """
   return enable_eager_execution_internal(
-      config, device_policy, execution_mode, None)
+      config=config,
+      device_policy=device_policy,
+      execution_mode=execution_mode,
+      server_def=None)
 
 
 def enable_eager_execution_internal(config=None,
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index f848b69782e0181a963fbdb177cc5c2771bf7ce9..48328a7f58da60d273f01afbb9a970a66c23c612 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import gc
+import os
 import threading
 import weakref
 
@@ -2542,6 +2543,56 @@ class StatisticsTest(test_util.TensorFlowTestCase):
     self.assertEqual(3, flops_total.value)
 
 
+class DeviceStackTest(test_util.TensorFlowTestCase):
+
+  def testBasicDeviceAssignmentMetadata(self):
+
+    def device_func(unused_op):
+      return "/cpu:*"
+
+    const_zero = constant_op.constant([0.0], name="zero")
+    with ops.device("/cpu"):
+      const_one = constant_op.constant([1.0], name="one")
+      with ops.device("/cpu:0"):
+        const_two = constant_op.constant([2.0], name="two")
+    with ops.device(device_func):
+      const_three = constant_op.constant(3.0, name="three")
+
+    self.assertEqual(0, len(const_zero.op._device_assignments))
+
+    one_list = const_one.op._device_assignments
+    self.assertEqual(1, len(one_list))
+    self.assertEqual("/cpu", one_list[0].obj)
+    self.assertEqual("ops_test.py", os.path.basename(one_list[0].filename))
+
+    two_list = const_two.op._device_assignments
+    self.assertEqual(2, len(two_list))
+    devices = [t.obj for t in two_list]
+    self.assertEqual(set(["/cpu", "/cpu:0"]), set(devices))
+
+    three_list = const_three.op._device_assignments
+    self.assertEqual(1, len(three_list))
+    func_description = three_list[0].obj
+    expected_regex = r"device_func<.*ops_test.py, [0-9]+"
+    self.assertRegexpMatches(func_description, expected_regex)
+
+  def testDeviceAssignmentMetadataForGraphDeviceAndTfDeviceFunctions(self):
+
+    with ops.device("/cpu"):
+      const_one = constant_op.constant([1.0], name="one")
+    with ops.get_default_graph().device("/cpu"):
+      const_two = constant_op.constant([2.0], name="two")
+
+    one_metadata = const_one.op._device_assignments[0]
+    two_metadata = const_two.op._device_assignments[0]
+
+    # Verify both types of device assignment return the right stack info.
+    self.assertRegexpMatches("ops_test.py",
+                             os.path.basename(one_metadata.filename))
+    self.assertEqual(one_metadata.filename, two_metadata.filename)
+    self.assertEqual(one_metadata.lineno + 2, two_metadata.lineno)
+
+
 class ColocationGroupTest(test_util.TensorFlowTestCase):
 
   def testBasic(self):
@@ -2554,13 +2605,17 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       c.op.get_attr("_class")
 
-    # Roughly test that stack information is being saved correctly for the op.
-    locations_dict = b.op._colocation_dict
-    self.assertIn("a", locations_dict)
-    metadata = locations_dict["a"]
+  def testBasicColocationMetadata(self):
+    const_two = constant_op.constant([2.0], name="two")
+    with ops.colocate_with(const_two.op):
+      const_three = constant_op.constant(3.0, name="three")
+    locations_dict = const_three.op._colocation_dict
+    self.assertIn("two", locations_dict)
+    metadata = locations_dict["two"]
     self.assertIsNone(metadata.obj)
-    basename = metadata.filename.split("/")[-1]
-    self.assertEqual("ops_test.py", basename)
+    # Check that this test's filename is recorded as the file containing the
+    # colocation statement.
+    self.assertEqual("ops_test.py", os.path.basename(metadata.filename))
 
   def testColocationDeviceInteraction(self):
     with ops.device("/cpu:0"):
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index ca63efbc84dab20850845841e9e212a681b6bb06..9a0f34fad2ea789175786ec89cc1156061218610 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -67,10 +67,16 @@ def SlowAppendBFloat16ArrayToTensorProto(tensor_proto, proto_values):
       [ExtractBitsFromBFloat16(x) for x in proto_values])
 
 
+def FastAppendBFloat16ArrayToTensorProto(tensor_proto, proto_values):
+  fast_tensor_util.AppendBFloat16ArrayToTensorProto(
+      tensor_proto, np.asarray(
+          proto_values, dtype=dtypes.bfloat16.as_numpy_dtype).view(np.uint16))
+
+
 if _FAST_TENSOR_UTIL_AVAILABLE:
   _NP_TO_APPEND_FN = {
       dtypes.bfloat16.as_numpy_dtype:
-          SlowAppendBFloat16ArrayToTensorProto,
+          FastAppendBFloat16ArrayToTensorProto,
       np.float16:
           _MediumAppendFloat16ArrayToTensorProto,
       np.float32:
@@ -935,8 +941,10 @@ def constant_value_as_shape(tensor):  # pylint: disable=invalid-name
 def is_tensor(x):  # pylint: disable=invalid-name
   """Check whether `x` is of tensor type.
 
-  Check whether an object is a tensor. Equivalent to
-  `isinstance(x, [tf.Tensor, tf.SparseTensor, tf.Variable])`.
+  Check whether an object is a tensor. This check is equivalent to calling
+  `isinstance(x, [tf.Tensor, tf.SparseTensor, tf.Variable])` and also checks
+  if all the component variables of a MirroredVariable or a TowerLocalVariable
+  are tensors.
 
   Args:
     x: A python object to check.
@@ -944,4 +952,5 @@ def is_tensor(x):  # pylint: disable=invalid-name
   Returns:
     `True` if `x` is a tensor, `False` if not.
   """
-  return isinstance(x, ops._TensorLike) or ops.is_dense_tensor_like(x)  # pylint: disable=protected-access
+  return (isinstance(x, ops._TensorLike) or ops.is_dense_tensor_like(x) or  # pylint: disable=protected-access
+          (hasattr(x, "is_tensor_like") and x.is_tensor_like))
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 2bc2a189fa8e825613ca834e2c06ea916074d455..fc47b1cca51c977a9398cf1c8a7c09cb0a088037 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -19,6 +19,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+from collections import OrderedDict
 import contextlib
 import gc
 import itertools
@@ -571,6 +573,78 @@ def assert_no_garbage_created(f):
   return decorator
 
 
+def _combine_named_parameters(**kwargs):
+  """Generate combinations based on its keyword arguments.
+
+  Two sets of returned combinations can be concatenated using +.  Their product
+  can be computed using `times()`.
+
+  Args:
+    **kwargs: keyword arguments of form `option=[possibilities, ...]`
+         or `option=the_only_possibility`.
+
+  Returns:
+    a list of dictionaries for each combination. Keys in the dictionaries are
+    the keyword argument names.  Each key has one value - one of the
+    corresponding keyword argument values.
+  """
+  if not kwargs:
+    return [OrderedDict()]
+
+  sort_by_key = lambda k: k[0][0]
+  kwargs = OrderedDict(sorted(kwargs.items(), key=sort_by_key))
+  first = list(kwargs.items())[0]
+
+  rest = dict(list(kwargs.items())[1:])
+  rest_combined = _combine_named_parameters(**rest)
+
+  key = first[0]
+  values = first[1]
+  if not isinstance(values, list):
+    values = [values]
+
+  combinations = [
+      OrderedDict(sorted(list(combined.items()) + [(key, v)], key=sort_by_key))
+      for v in values
+      for combined in rest_combined
+  ]
+  return combinations
+
+
+def generate_combinations_with_testcase_name(**kwargs):
+  """Generate combinations based on its keyword arguments using combine().
+
+  This function calls combine() and appends a testcase name to the list of
+  dictionaries returned. The 'testcase_name' key is a required for named
+  parameterized tests.
+
+  Args:
+    **kwargs: keyword arguments of form `option=[possibilities, ...]`
+         or `option=the_only_possibility`.
+
+  Returns:
+    a list of dictionaries for each combination. Keys in the dictionaries are
+    the keyword argument names.  Each key has one value - one of the
+    corresponding keyword argument values.
+  """
+  combinations = _combine_named_parameters(**kwargs)
+  named_combinations = []
+  for combination in combinations:
+    assert isinstance(combination, OrderedDict)
+    name = "".join([
+        "_{}_{}".format(
+            "".join(filter(str.isalnum, key)),
+            "".join(filter(str.isalnum, str(value))))
+        for key, value in combination.items()
+    ])
+    named_combinations.append(
+        OrderedDict(
+            list(combination.items()) + [("testcase_name",
+                                          "_test{}".format(name))]))
+
+  return named_combinations
+
+
 def run_all_in_graph_and_eager_modes(cls):
   """Execute all test methods in the given class with and without eager."""
   base_decorator = run_in_graph_and_eager_modes
@@ -1227,8 +1301,8 @@ class TensorFlowTestCase(googletest.TestCase):
       a = a._asdict()
     if hasattr(b, "_asdict"):
       b = b._asdict()
-    a_is_dict = isinstance(a, dict)
-    if a_is_dict != isinstance(b, dict):
+    a_is_dict = isinstance(a, collections.Mapping)
+    if a_is_dict != isinstance(b, collections.Mapping):
       raise ValueError("Can't compare dict to non-dict, a%s vs b%s. %s" %
                        (path_str, path_str, msg))
     if a_is_dict:
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index 122c14c8473f133f6a3bed1e6297394eaa1b845c..f983cbef0415de98fd6cae717f12fbcde3fe78bd 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -73,7 +73,7 @@ class TestUtilTest(test_util.TensorFlowTestCase):
     test_util.assert_equal_graph_def(def_57, def_75)
     # Compare two unequal graphs
     with self.assertRaisesRegexp(AssertionError,
-                                 r"^Found unexpected node 'seven"):
+                                 r"^Found unexpected node '{{node seven}}"):
       test_util.assert_equal_graph_def(def_57, def_empty)
 
   def testIsGoogleCudaEnabled(self):
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 7d07c77c797668c858014cc31cf713050627d72f..8cc971c61d5964d0fad1bfa843c3ef8d3407599f 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -1340,7 +1340,7 @@ class LayoutOptimizerTest(test.TestCase):
       expected_num_transposes = 2
       self.assertEqual(expected_num_transposes, num_transposes)
       self._assert_trans_nhwc_to_nchw('Conv2D-0', nodes)
-      self.assertAllEqual(output_val_ref, output_val)
+      self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
   def testLoop(self):
     if test.is_gpu_available(cuda_only=True):
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index a495d485455ac6871a78e0d79e3136e799c4f527..df409d2aa5c5911a2de4253445a1f8b7e5a184df 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -732,7 +732,6 @@ py_test(
     size = "medium",
     srcs = ["preprocessing/image_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["nomsan"],  # TODO(b/110990716) reenable
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py
index f608dea430f0573503713f0cbc60f8921e6df51e..99645de736fc9e3f34c3ea29171cde0f91d8345a 100644
--- a/tensorflow/python/keras/activations.py
+++ b/tensorflow/python/keras/activations.py
@@ -128,20 +128,26 @@ def softsign(x):
 
 
 @tf_export('keras.activations.relu')
-def relu(x, alpha=0., max_value=None):
+def relu(x, alpha=0., max_value=None, threshold=0):
   """Rectified Linear Unit.
 
+  With default values, it returns element-wise `max(x, 0)`.
+
+  Otherwise, it follows:
+  `f(x) = max_value` for `x >= max_value`,
+  `f(x) = x` for `threshold <= x < max_value`,
+  `f(x) = alpha * (x - threshold)` otherwise.
+
   Arguments:
-      x: Input tensor.
-      alpha: Slope of the negative part. Defaults to zero.
-      max_value: Maximum value for the output.
+      x: A tensor or variable.
+      alpha: A scalar, slope of negative section (default=`0.`).
+      max_value: float. Saturation threshold.
+      threshold: float. Threshold value for thresholded activation.
 
   Returns:
-      The (leaky) rectified linear unit activation: `x` if `x > 0`,
-        `alpha * x` if `x < 0`. If `max_value` is defined, the result
-        is truncated to this value.
+      A tensor.
   """
-  return K.relu(x, alpha=alpha, max_value=max_value)
+  return K.relu(x, alpha=alpha, max_value=max_value, threshold=threshold)
 
 
 @tf_export('keras.activations.tanh')
diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 333f927d2f5cb981b392617ab79232754b2d93e3..38794f1612d7509cb9e75631679712dbb6729c89 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -3372,26 +3372,48 @@ def in_test_phase(x, alt, training=None):
 
 
 @tf_export('keras.backend.relu')
-def relu(x, alpha=0., max_value=None):
+def relu(x, alpha=0., max_value=None, threshold=0):
   """Rectified linear unit.
 
   With default values, it returns element-wise `max(x, 0)`.
 
+  Otherwise, it follows:
+  `f(x) = max_value` for `x >= max_value`,
+  `f(x) = x` for `threshold <= x < max_value`,
+  `f(x) = alpha * (x - threshold)` otherwise.
+
   Arguments:
       x: A tensor or variable.
       alpha: A scalar, slope of negative section (default=`0.`).
-      max_value: Saturation threshold.
+      max_value: float. Saturation threshold.
+      threshold: float. Threshold value for thresholded activation.
 
   Returns:
       A tensor.
   """
+  clip_max = max_value is not None
+
   if alpha != 0.:
-    negative_part = nn.relu(-x)
-  x = nn.relu(x)
-  if max_value is not None:
+    if threshold != 0:
+      negative_part = nn.relu(-x + threshold)
+    else:
+      negative_part = nn.relu(-x)
+
+  if threshold != 0:
+    # computes x for x > threshold else 0
+    x = x * math_ops.cast(math_ops.greater(x, threshold), floatx())
+  elif max_value == 6:
+    # if no threshold, then can use nn.relu6 native TF op for performance
+    x = nn.relu6(x)
+    clip_max = False
+  else:
+    x = nn.relu(x)
+
+  if clip_max:
     max_value = _to_tensor(max_value, x.dtype.base_dtype)
     zero = _to_tensor(0., x.dtype.base_dtype)
     x = clip_ops.clip_by_value(x, zero, max_value)
+
   if alpha != 0.:
     alpha = _to_tensor(alpha, x.dtype.base_dtype)
     x -= alpha * negative_part
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index 36478ea089a871667908d70e33422aef8444a3e4..40e79100613daef6731cfc71d5aad0cb0b6cc275 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -23,6 +23,7 @@ import scipy.sparse
 
 from tensorflow.python import keras
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -490,6 +491,66 @@ class BackendLinearAlgebraTest(test.TestCase):
                                        input_shape_a=(4, 7),
                                        input_shape_b=(4, 7))
 
+  def test_relu(self):
+    x = ops.convert_to_tensor([[-4, 0], [2, 7]], 'float32')
+    with self.test_session():
+      # standard relu
+      relu_op = keras.backend.relu(x)
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+
+      # alpha
+      relu_op = keras.backend.relu(x, alpha=0.5)
+      self.assertAllClose(keras.backend.eval(relu_op), [[-2, 0], [2, 7]])
+
+      # max_value < some elements
+      relu_op = keras.backend.relu(x, max_value=5)
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 5]])
+
+      # nn.relu6 used
+      relu_op = keras.backend.relu(x, max_value=6)
+      self.assertTrue('Relu6' in relu_op.name)  # uses tf.nn.relu6
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 6]])
+
+      # max value > 6
+      relu_op = keras.backend.relu(x, max_value=10)
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+
+      # max value is float
+      relu_op = keras.backend.relu(x, max_value=4.3)
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 4.3]])
+
+      # max value == 0
+      relu_op = keras.backend.relu(x, max_value=0)
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 0]])
+
+      # alpha and max_value
+      relu_op = keras.backend.relu(x, alpha=0.25, max_value=3)
+      self.assertAllClose(keras.backend.eval(relu_op), [[-1, 0], [2, 3]])
+
+      # threshold
+      relu_op = keras.backend.relu(x, threshold=3)
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 7]])
+
+      # threshold is float
+      relu_op = keras.backend.relu(x, threshold=1.5)
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [2, 7]])
+
+      # threshold is negative
+      relu_op = keras.backend.relu(x, threshold=-5)
+      self.assertAllClose(keras.backend.eval(relu_op), [[-4, 0], [2, 7]])
+
+      # threshold and max_value
+      relu_op = keras.backend.relu(x, threshold=3, max_value=5)
+      self.assertAllClose(keras.backend.eval(relu_op), [[0, 0], [0, 5]])
+
+      # threshold and alpha
+      relu_op = keras.backend.relu(x, alpha=0.25, threshold=4)
+      self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 7]])
+
+      # threshold, alpha, and max_value
+      relu_op = keras.backend.relu(x, alpha=0.25, threshold=4, max_value=5)
+      self.assertAllClose(keras.backend.eval(relu_op), [[-2, -1], [-0.5, 5]])
+
 
 class BackendShapeOpsTest(test.TestCase):
 
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index 0857a3279f118bca48d762e00248a2908f2a180e..d1b9dc27bdb78e2a576021ccc5f93201c318c9b4 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -740,6 +740,7 @@ class TensorBoard(Callback):
     self.write_images = write_images
     self.batch_size = batch_size
     self._current_batch = 0
+    self._total_batches_seen = 0
     # abstracted writer class to be able to stub for testing
     self._writer_class = tf_summary.FileWriter
     self.embeddings_freq = embeddings_freq
@@ -883,6 +884,24 @@ class TensorBoard(Callback):
         self._epoch + self._current_val_batch / self._validation_batches)
     self._current_val_batch += 1
 
+  def _write_custom_summaries(self, step, logs=None):
+    """Writes metrics out as custom scalar summaries.
+
+    Arguments:
+        step: the global step to use for Tensorboard.
+        logs: dict. Keys are scalar summary names, values are
+            NumPy scalars.
+
+    """
+    logs = logs or {}
+    for name, value in logs.items():
+      summary = tf_summary.Summary()
+      summary_value = summary.value.add()
+      summary_value.simple_value = value.item()
+      summary_value.tag = name
+      self.writer.add_summary(summary, step)
+    self.writer.flush()
+
   def on_train_begin(self, logs=None):
     """Checks if histogram summaries can be run."""
 
@@ -899,6 +918,16 @@ class TensorBoard(Callback):
         raise ValueError(
             'If printing histograms, validation data must have length > 0.')
 
+  def on_batch_end(self, batch, logs=None):
+    """Writes scalar summaries for metrics on every training batch."""
+    # Don't output batch_size and batch number as Tensorboard summaries
+    logs = logs or {}
+    batch_logs = {('batch_' + k): v
+                  for k, v in logs.items()
+                  if k not in ['batch', 'size']}
+    self._write_custom_summaries(self._total_batches_seen, batch_logs)
+    self._total_batches_seen += 1
+
   def on_epoch_begin(self, epoch, logs=None):
     """Add histogram op to Model test_function callbacks, reset batch count."""
 
@@ -915,7 +944,12 @@ class TensorBoard(Callback):
   def on_epoch_end(self, epoch, logs=None):
     """Checks if summary ops should run next epoch, logs scalar summaries."""
 
-    logs = logs or {}
+    # don't output batch_size and
+    # batch number as Tensorboard summaries
+    logs = {('epoch_' + k): v
+            for k, v in logs.items()
+            if k not in ['batch', 'size']}
+    self._write_custom_summaries(epoch, logs)
 
     # pop the histogram summary op after each epoch
     if self.histogram_freq:
@@ -964,16 +998,6 @@ class TensorBoard(Callback):
 
           i += self.batch_size
 
-    for name, value in logs.items():
-      if name in ['batch', 'size']:
-        continue
-      summary = tf_summary.Summary()
-      summary_value = summary.value.add()
-      summary_value.simple_value = value.item()
-      summary_value.tag = name
-      self.writer.add_summary(summary, epoch)
-    self.writer.flush()
-
   def on_train_end(self, logs=None):
     self.writer.close()
 
diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index 45598cafd34f23981bf68ebb2dfa36421a0bae5d..7d830078ce07c8b2a6fd2d86c0fc32cd20806ff9 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -1096,6 +1096,74 @@ class KerasCallbacksTest(test.TestCase):
 
       assert os.path.exists(temp_dir)
 
+  def test_Tensorboard_batch_logging(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+        self.batches_logged = []
+        self.summary_values = []
+        self.summary_tags = []
+
+      def add_summary(self, summary, step):
+        self.summary_values.append(summary.value[0].simple_value)
+        self.summary_tags.append(summary.value[0].tag)
+        self.batches_logged.append(step)
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    logdir = 'fake_dir'
+
+    # log every batch
+    tb_cbk = keras.callbacks.TensorBoard(logdir)
+    tb_cbk.writer = FileWriterStub(logdir)
+
+    for batch in range(5):
+      tb_cbk.on_batch_end(batch, {'acc': np.float32(batch)})
+    self.assertEqual(tb_cbk.writer.batches_logged, [0, 1, 2, 3, 4])
+    self.assertEqual(tb_cbk.writer.summary_values, [0., 1., 2., 3., 4.])
+    self.assertEqual(tb_cbk.writer.summary_tags, ['batch_acc'] * 5)
+
+  def test_Tensorboard_epoch_and_batch_logging(self):
+
+    class FileWriterStub(object):
+
+      def __init__(self, logdir, graph=None):
+        self.logdir = logdir
+        self.graph = graph
+
+      def add_summary(self, summary, step):
+        if 'batch_' in summary.value[0].tag:
+          self.batch_summary = (step, summary)
+        elif 'epoch_' in summary.value[0].tag:
+          self.epoch_summary = (step, summary)
+
+      def flush(self):
+        pass
+
+      def close(self):
+        pass
+
+    logdir = 'fake_dir'
+
+    tb_cbk = keras.callbacks.TensorBoard(logdir)
+    tb_cbk.writer = FileWriterStub(logdir)
+
+    tb_cbk.on_batch_end(0, {'acc': np.float32(5.0)})
+    tb_cbk.on_epoch_end(0, {'acc': np.float32(10.0)})
+    batch_step, batch_summary = tb_cbk.writer.batch_summary
+    self.assertEqual(batch_step, 0)
+    self.assertEqual(batch_summary.value[0].simple_value, 5.0)
+    epoch_step, epoch_summary = tb_cbk.writer.epoch_summary
+    self.assertEqual(epoch_step, 0)
+    self.assertEqual(epoch_summary.value[0].simple_value, 10.0)
+
   def test_RemoteMonitorWithJsonPayload(self):
     if requests is None:
       self.skipTest('`requests` required to run this test')
diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index e02792208bdedb85601f0eacdf19836d331b1804..c7ab6750cb4cee2a3b286f59723b98f9336a1a80 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -26,6 +26,7 @@ import numpy as np
 from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function as eager_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -174,6 +175,12 @@ class Layer(checkpointable.CheckpointableBase):
 
     self.supports_masking = False
 
+    call_argspec = tf_inspect.getargspec(self.call)
+    if 'training' in call_argspec.args:
+      self._expects_training_arg = True
+    else:
+      self._expects_training_arg = False
+
     # Manage input shape information if passed.
     if 'input_shape' in kwargs or 'batch_input_shape' in kwargs:
       # In this case we will later create an input layer
@@ -723,9 +730,17 @@ class Layer(checkpointable.CheckpointableBase):
             self._dtype = input_list[0].dtype.base_dtype.name
           except AttributeError:
             pass
+
         if all(hasattr(x, 'shape') for x in input_list):
           input_shapes = nest.map_structure(lambda x: x.shape, inputs)
-        self.build(input_shapes)
+
+        if (not hasattr(self, '_is_graph_network') or
+            self.__class__.__name__ == 'Sequential'):
+          # Only if self is a layer or an instance of a sequential model do we
+          # need to build it.
+          self.build(input_shapes)
+        # We must set self.built since user defined build functions are not
+        # constrained to set self.built.
         self.built = True
 
       # Check input assumptions set after layer building, e.g. input shape.
@@ -778,17 +793,8 @@ class Layer(checkpointable.CheckpointableBase):
     if hasattr(self, '_initial_weights') and self._initial_weights is not None:
       self.set_weights(self._initial_weights)
       del self._initial_weights
-    self._post_build_cleanup()
     return outputs
 
-  def _post_build_cleanup(self):
-    """Hooks to run after all sub-Layers are built."""
-    # Note that in addition to Layer.__call__, this method is called by Model
-    # after building a graph network (which skips __call__). It should be called
-    # when possible if self.built may have switched from False to True, and is
-    # idempotent.
-    pass  # No-op for Layers which don't override this method.
-
   def apply(self, inputs, *args, **kwargs):
     """Apply the layer on a input.
 
@@ -962,6 +968,39 @@ class Layer(checkpointable.CheckpointableBase):
     Returns:
         An input shape tuple.
     """
+    if context.executing_eagerly():
+      # In this case we build the model first in order to do shape inference.
+      # This is acceptable because the framework only calls
+      # `compute_output_shape` on shape values that the layer would later be
+      # built for. It would however cause issues in case a user attempts to
+      # use `compute_output_shape` manually (these users will have to
+      # implement `compute_output_shape` themselves).
+      self.build(input_shape)
+
+      with context.graph_mode():
+        graph = eager_function.CapturingGraph({})
+        with graph.as_default():
+          if isinstance(input_shape, list):
+            inputs = [generate_placeholders_from_shape(shape)
+                      for shape in input_shape]
+          else:
+            inputs = generate_placeholders_from_shape(input_shape)
+
+          try:
+            if self._expects_training_arg:
+              outputs = self(inputs, training=False)
+            else:
+              outputs = self(inputs)
+          except TypeError:
+            raise NotImplementedError('We could not automatically infer '
+                                      'the static shape of the layer\'s output.'
+                                      ' Please implement the '
+                                      '`compute_output_shape` method on your '
+                                      'layer (%s).' % self.__class__.__name__)
+      if isinstance(outputs, list):
+        return [output.shape for output in outputs]
+      else:
+        return outputs.shape
     raise NotImplementedError
 
   def compute_mask(self, inputs, mask=None):  # pylint: disable=unused-argument
@@ -1917,3 +1956,18 @@ def make_variable(name,
       synchronization=synchronization,
       aggregation=aggregation)
   return v
+
+
+def generate_dummy_data_from_shape(shape):
+  if isinstance(shape, tensor_shape.TensorShape):
+    shape = shape.as_list()
+
+  # Replace Nones in input shape with dummy `1` value
+  shape = [x.value if isinstance(x, tensor_shape.Dimension) else x
+           for x in shape]
+  shape = [1 if x is None else x for x in shape]
+  return array_ops.ones(shape, dtype=backend.floatx())
+
+
+def generate_placeholders_from_shape(shape):
+  return array_ops.placeholder(shape=shape, dtype=backend.floatx())
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index a4d96de74fc90e31d52f9a67e845a84f9ceb5034..d9b031d080c91af033194209d31b3c13ed6c75cd 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -20,7 +20,6 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
-import functools
 import json
 import os
 import weakref
@@ -30,6 +29,7 @@ from six.moves import zip  # pylint: disable=redefined-builtin
 
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -144,10 +144,6 @@ class Network(base_layer.Layer):
 
     self._checkpointable_saver = checkpointable_utils.CheckpointableSaver(
         weakref.ref(self))
-    # A zero-argument function which should be called and set back to None as
-    # soon as the network is built (only applicable to subclassed Models). Runs
-    # restore operations when graph building.
-    self._in_progress_restore_finalizer = None
 
   @checkpointable.no_automatic_dependency_tracking
   def _init_graph_network(self, inputs, outputs, name=None):
@@ -318,8 +314,8 @@ class Network(base_layer.Layer):
     else:
       self._expects_training_arg = False
     self._call_convention = self._determine_call_convention(call_argspec)
-    self.outputs = None
-    self.inputs = None
+    self.outputs = []
+    self.inputs = []
     self.built = False
 
   def _determine_call_convention(self, call_argspec):
@@ -739,6 +735,86 @@ class Network(base_layer.Layer):
       return specs[0]
     return specs
 
+  def build(self, input_shape):
+    """Builds the model based on input shapes received.
+
+    This is to be used for subclassed models, which do not know at instantiation
+    time what their inputs look like.
+
+    Args:
+     input_shape: Single tuple, TensorShape, or list of shapes, where shapes
+         are tuples, integers, or TensorShapes.
+
+    Raises:
+      ValueError:
+        1. In case of invalid user-provided data (not of type tuple,
+           list, or TensorShape).
+        2. If the model requires call arguments that are agnostic
+           to the input shapes (positional or kwarg in call signature).
+        3. If not all layers were properly built.
+        4. If float type inputs are not supported within the layers.
+
+      In each of these cases, the user should build their model by calling it
+      on real tensor data.
+    """
+    if self._is_graph_network:
+      self.built = True
+      return
+
+    # If subclass network
+    if input_shape is None:
+      raise ValueError('Input shape must be defined when calling build on a '
+                       'model subclass network.')
+    valid_types = (tuple, list, tensor_shape.TensorShape)
+    if not isinstance(input_shape, valid_types):
+      raise ValueError('Specified input shape is not one of the valid types. '
+                       'Please specify a batch input shape of type tuple or '
+                       'list of input shapes. User provided '
+                       'input type: {}'.format(type(input_shape)))
+
+    if input_shape and not self.inputs:
+      if isinstance(input_shape, list):
+        # List of input shapes
+        x = [base_layer.generate_dummy_data_from_shape(shape)
+             for shape in input_shape]
+      else:
+        x = base_layer.generate_dummy_data_from_shape(input_shape)
+
+      kwargs = {}
+      num_call_args = len(tf_inspect.getargspec(self.call).args)
+      if self._expects_training_arg and num_call_args == 3:
+        # Has call signature of call(self, input, training)
+        kwargs['training'] = False
+      elif num_call_args > 2:
+        # Has invalid call signature of call(self, input, *args, **kwargs)
+        raise ValueError('Currently, you cannot build your model if it has '
+                         'positional or keyword arguments that are not '
+                         'inputs to the model, but are required for its '
+                         '`call` method. Instead, in order to instantiate '
+                         'and build your model, `call` your model on real '
+                         'tensor data with all expected call arguments.')
+
+      try:
+        self.call(x, **kwargs)
+      except (errors.InvalidArgumentError, TypeError):
+        raise ValueError('You cannot build your model by calling `build` '
+                         'if your layers do not support float type inputs. '
+                         'Instead, in order to instantiate and build your '
+                         'model, `call` your model on real tensor data (of '
+                         'the correct dtype).')
+
+    if self._layers:
+      self._track_layers(self._layers)
+    if self.layers:
+      for layer in self.layers:
+        if not layer.built:
+          raise ValueError('Layer: {} was not built in your model. Calling '
+                           '`build` manually on a subclassed model is only '
+                           'allowed for models with a static topology. '
+                           'In this case, you can build your model by '
+                           'calling it on real tensor data.'.format(layer))
+    self.built = True
+
   def call(self, inputs, training=None, mask=None):
     """Calls the model on new inputs.
 
@@ -779,6 +855,8 @@ class Network(base_layer.Layer):
 
   def compute_output_shape(self, input_shape):
     if not self._is_graph_network:
+      if context.executing_eagerly():
+        return super(Network, self).compute_output_shape(input_shape)
       raise NotImplementedError
 
     if isinstance(input_shape, list):
@@ -1423,13 +1501,9 @@ class Network(base_layer.Layer):
             'load_weights).')
       if not context.executing_eagerly():
         session = backend.get_session()
-        finalizer = functools.partial(status.run_restore_ops, session=session)
-        if self.built:
-          finalizer()
-        else:
-          # Hold on to this status object until the network is built (for
-          # subclassed Models). Then we'll run restore ops if necessary.
-          self._in_progress_restore_finalizer = finalizer
+        # Restore existing variables (if any) immediately, and set up a
+        # streaming restore for any variables created in the future.
+        checkpointable_utils.streaming_restore(status=status, session=session)
       return status
     if h5py is None:
       raise ImportError(
@@ -1447,14 +1521,6 @@ class Network(base_layer.Layer):
       else:
         saving.load_weights_from_hdf5_group(f, self.layers)
 
-  def _post_build_cleanup(self):
-    super(Network, self)._post_build_cleanup()
-    if self._in_progress_restore_finalizer is not None:
-      # Runs queued restore operations left over from load_weights when graph
-      # building.
-      self._in_progress_restore_finalizer()
-      self._in_progress_restore_finalizer = None
-
   def _updated_config(self):
     """Util shared between different serialization methods.
 
diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py
index 030328f2a66f0ec406ac271aecfbf2dbebf22f5f..e029e614e04be299f22714fac0fd8b2cd93c0e92 100644
--- a/tensorflow/python/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/engine/saving_test.py
@@ -722,18 +722,23 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
         self.assertEqual(len(graph.get_operations()), op_count)
 
   def _weight_loading_test_template(self, make_model_fn):
-    with self.test_session() as session:
+    with self.test_session():
       model = make_model_fn()
+      model.compile(
+          loss='mse',
+          optimizer=training_module.RMSPropOptimizer(0.1),
+          metrics=['acc'])
       temp_dir = self.get_temp_dir()
       prefix = os.path.join(temp_dir, 'ckpt')
+      train_x = np.random.random((3, 2))
+      train_y = np.random.random((3,))
+      x = constant_op.constant(train_x, dtype=dtypes.float32)
 
-      x = constant_op.constant(np.random.random((3, 2)), dtype=dtypes.float32)
-      executing_eagerly = context.executing_eagerly()
-      ref_y_tensor = model(x)
-      if not executing_eagerly:
-        session.run([v.initializer for v in model.variables])
-      ref_y = self.evaluate(ref_y_tensor)
+      model.train_on_batch(train_x, train_y)
       model.save_weights(prefix, save_format='tf')
+      ref_y_before_train = model.predict(train_x)
+      model.train_on_batch(train_x, train_y)
+      ref_y_after_train = model.predict(train_x)
       for v in model.variables:
         self.evaluate(
             v.assign(random_ops.random_normal(shape=array_ops.shape(v))))
@@ -741,16 +746,27 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
       self.addCleanup(shutil.rmtree, temp_dir)
 
       model.load_weights(prefix)
-      y = self.evaluate(model(x))
-      self.assertAllClose(ref_y, y)
+      self.assertAllClose(ref_y_before_train, self.evaluate(model(x)))
 
       # Test restore-on-create if this is a subclassed Model (graph Networks
       # will have already created their variables).
       load_model = make_model_fn()
       load_model.load_weights(prefix)
-      restore_on_create_y_tensor = load_model(x)
-      restore_on_create_y = self.evaluate(restore_on_create_y_tensor)
-      self.assertAllClose(ref_y, restore_on_create_y)
+      self.assertAllClose(
+          ref_y_before_train,
+          self.evaluate(load_model(x)))
+      load_model = make_model_fn()
+      load_model.load_weights(prefix)
+      # We need to run some of the restore ops for predict(), but not all
+      # variables have been created yet (optimizer slot variables). Tests
+      # incremental restore.
+      load_model.predict(train_x)
+      load_model.compile(
+          loss='mse',
+          optimizer=training_module.RMSPropOptimizer(0.1),
+          metrics=['acc'])
+      load_model.train_on_batch(train_x, train_y)
+      self.assertAllClose(ref_y_after_train, self.evaluate(load_model(x)))
 
   @test_util.run_in_graph_and_eager_modes
   def test_weight_loading_graph_model(self):
@@ -858,5 +874,6 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
         SubclassedModel, SubclassedModelRestore,
         _restore_init_fn)
 
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index 371504a503168e7443895bb22a57126b274da226..41cdfda660e69f41e4f3d15e2e61ac8f45654436 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -213,13 +213,31 @@ class Sequential(Model):
       self.outputs = [self.layers[-1].output]
       self.build()
 
-  @checkpointable.no_automatic_dependency_tracking
   def build(self, input_shape=None):
-    if input_shape and not self.inputs:
-      batch_shape = tuple(input_shape)
+    self._set_inputs_and_outputs(input_shape=input_shape)
+
+  def symbolic_set_inputs(self, inputs):
+    self._set_inputs_and_outputs(tensor=inputs)
+
+  @checkpointable.no_automatic_dependency_tracking
+  def _set_inputs_and_outputs(self, input_shape=None, tensor=None):
+    """Set model's input and output specs based on the input received.
+
+    If `tensor` is provided, `input_shape` is not required.
+
+    Args:
+      input_shape: Optional shape of input.
+      tensor: Optional existing tensor to wrap into the `Input` layer.
+    """
+    if not self.inputs:
       dtype = K.floatx()
-      x = Input(
-          batch_shape=batch_shape, dtype=dtype, name=self.name + '_input')
+      if tensor is not None:
+        batch_shape = (None,) + tuple(tensor.get_shape().as_list()[1:])
+        x = Input(dtype=dtype, name=self.name + '_input', tensor=tensor)
+      elif input_shape is not None:
+        batch_shape = tuple(input_shape)
+        x = Input(
+            batch_shape=batch_shape, dtype=dtype, name=self.name + '_input')
       self.inputs = [x]
       for layer in self._layers:
         x = layer(x)
diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py
index 0f54e29cee38bd12d691b03ae98d3e578b7ff907..4f4adca33344dddc6e9c92cda94fff7289b35302 100644
--- a/tensorflow/python/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/engine/sequential_test.py
@@ -22,7 +22,6 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
@@ -104,9 +103,6 @@ class TestSequential(test.TestCase):
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_sequential_deferred_build_with_dataset_iterators(self):
-    if not context.executing_eagerly():
-      # TODO(psv/fchollet): Add support for this use case in graph mode.
-      return
     num_hidden = 5
     input_dim = 3
     num_classes = 2
@@ -136,6 +132,48 @@ class TestSequential(test.TestCase):
                      [None, num_classes])
     self.assertEqual(len(model.weights), 2 * 2)
 
+  def test_training_and_eval_methods_on_symbolic_tensors(self):
+    with self.test_session():
+
+      def create_model():
+        model = keras.Sequential()
+        model.add(keras.layers.Dense(10, activation='relu'))
+        model.add(keras.layers.Dense(4, activation='softmax'))
+
+        model.compile(
+            optimizer=rmsprop.RMSPropOptimizer(1e-3),
+            loss='categorical_crossentropy',
+            metrics=['accuracy'])
+        return model
+
+      inputs = keras.backend.zeros(shape=(10, 3))
+      targets = keras.backend.zeros(shape=(10, 4))
+
+      model = create_model()
+      model.fit(inputs, targets, epochs=10, steps_per_epoch=30)
+
+      model = create_model()
+      model.evaluate(inputs, targets, steps=2, verbose=0)
+
+      model = create_model()
+      model.predict(inputs, steps=2)
+
+      model = create_model()
+      model.train_on_batch(inputs, targets)
+
+      model = create_model()
+      model.test_on_batch(inputs, targets)
+
+      model = create_model()
+      model.fit(
+          inputs,
+          targets,
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0,
+          validation_data=(inputs, targets),
+          validation_steps=2)
+
   @tf_test_util.run_in_graph_and_eager_modes
   def test_invalid_use_cases(self):
     # Added objects must be layer instances
diff --git a/tensorflow/python/keras/engine/topology_test.py b/tensorflow/python/keras/engine/topology_test.py
index 3eb69bd7f3d42f5cd8d6cc6d2d32cc9eb808d9a4..34f74db6ef17b2ec72c6cfe0da9a8efb28f25c38 100644
--- a/tensorflow/python/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/engine/topology_test.py
@@ -110,7 +110,6 @@ class TopologyConstructionTest(test.TestCase):
     layer = keras.layers.BatchNormalization()
     _ = layer.apply(x1)
 
-    print('BN updates', layer._updates)
     self.assertEqual(len(layer.updates), 2)
     self.assertEqual(len(layer.get_updates_for(x1)), 2)
     self.assertEqual(len(layer.get_updates_for(None)), 0)
@@ -960,9 +959,6 @@ class DeferredModeTest(test.TestCase):
       def call(self, inputs):
         return inputs[0] + inputs[1]
 
-      def compute_output_shape(self, input_shape):
-        return input_shape[0]
-
     c = AddLayer()([a, input_b])  # pylint: disable=not-callable
     c = keras.layers.Dense(2)(c)
 
@@ -978,6 +974,101 @@ class DeferredModeTest(test.TestCase):
       self.assertEqual(outputs[1].shape.as_list(), [10, 2])
 
 
+class DefaultShapeInferenceBehaviorTest(test.TestCase):
+
+  def _testShapeInference(self, model, input_shape, expected_output_shape):
+    input_value = np.random.random(input_shape)
+    output_value = model.predict(input_value)
+    self.assertEqual(output_value.shape, expected_output_shape)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testSingleInputCase(self):
+
+    class LayerWithOneInput(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.w = array_ops.ones(shape=(3, 4))
+
+      def call(self, inputs):
+        return keras.backend.dot(inputs, self.w)
+
+    inputs = input_layer_lib.Input(shape=(3,))
+    layer = LayerWithOneInput()
+
+    if context.executing_eagerly():
+      self.assertEqual(
+          layer.compute_output_shape((None, 3)).as_list(), [None, 4])
+      # As a side-effect, compute_output_shape builds the layer.
+      self.assertTrue(layer.built)
+      # We can still query the layer's compute_output_shape with compatible
+      # input shapes.
+      self.assertEqual(
+          layer.compute_output_shape((6, 3)).as_list(), [6, 4])
+
+    outputs = layer(inputs)
+    model = keras.Model(inputs, outputs)
+    self._testShapeInference(model, (2, 3), (2, 4))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testMultiInputOutputCase(self):
+
+    class MultiInputOutputLayer(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.w = array_ops.ones(shape=(3, 4))
+
+      def call(self, inputs):
+        a = keras.backend.dot(inputs[0], self.w)
+        b = a + inputs[1]
+        return [a, b]
+
+    input_a = input_layer_lib.Input(shape=(3,))
+    input_b = input_layer_lib.Input(shape=(4,))
+    output_a, output_b = MultiInputOutputLayer()([input_a, input_b])
+    model = keras.Model([input_a, input_b], [output_a, output_b])
+    output_a_val, output_b_val = model.predict(
+        [np.random.random((2, 3)), np.random.random((2, 4))])
+    self.assertEqual(output_a_val.shape, (2, 4))
+    self.assertEqual(output_b_val.shape, (2, 4))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testTrainingArgument(self):
+
+    class LayerWithTrainingArg(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.w = array_ops.ones(shape=(3, 4))
+
+      def call(self, inputs, training):
+        return keras.backend.dot(inputs, self.w)
+
+    inputs = input_layer_lib.Input(shape=(3,))
+    outputs = LayerWithTrainingArg()(inputs, training=False)
+    model = keras.Model(inputs, outputs)
+    self._testShapeInference(model, (2, 3), (2, 4))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testUnsupportedSignature(self):
+
+    class LayerWithAdditionalArg(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.w = array_ops.ones(shape=(3, 4))
+
+      def call(self, inputs, some_arg):
+        return keras.backend.dot(inputs, self.w) + some_arg
+
+    inputs = input_layer_lib.Input(shape=(3,))
+    if context.executing_eagerly():
+      with self.assertRaises(NotImplementedError):
+        outputs = LayerWithAdditionalArg()(inputs, some_arg=0)
+    else:
+      # Works with graph mode because the graph of ops is built together with
+      # the graph of layers.
+      outputs = LayerWithAdditionalArg()(inputs, some_arg=0)
+      _ = keras.Model(inputs, outputs)
+
+
 class GraphUtilsTest(test.TestCase):
 
   def testGetReachableFromInputs(self):
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index fbc2a11eda2d33dbe4a4926eed3c9e35789782ea..0fe14e99e068339c5ae9a11fe0fbbba3f205a043 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -24,13 +24,11 @@ import numpy as np
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import losses
-from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import optimizers
 from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import training_arrays
@@ -39,7 +37,6 @@ from tensorflow.python.keras.engine import training_generator
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.engine.network import Network
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
-from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -372,21 +369,14 @@ class Model(Network):
               'sample_weight_mode dictionary: "' + name + '". '
               'Only expected the following keys: ' + str(self.output_names))
       for i, name in enumerate(self.output_names):
-        if i in skip_target_weighing_indices:
-          weight = None
-          sample_weight_modes.append(None)
-        else:
-          if name not in sample_weight_mode:
-            raise ValueError(
-                'Output "' + name + '" missing from sample_weight_modes '
-                'dictionary')
-          if sample_weight_mode.get(name) == 'temporal':
-            weight = K.placeholder(ndim=2, name=name + '_sample_weights')
-            sample_weight_modes.append('temporal')
-          else:
-            weight = K.placeholder(ndim=1, name=name + 'sample_weights')
-            sample_weight_modes.append(None)
+        if (i not in skip_target_weighing_indices and
+            name not in sample_weight_mode):
+          raise ValueError('Output "' + name +
+                           '" missing from sample_weight_modes dictionary')
+        weight, mode = training_utils.get_output_sample_weight_and_mode(
+            skip_target_weighing_indices, sample_weight_mode.get(name), name, i)
         sample_weights.append(weight)
+        sample_weight_modes.append(mode)
     elif isinstance(sample_weight_mode, list):
       if len(sample_weight_mode) != len(self.outputs):
         raise ValueError('When passing a list as sample_weight_mode, '
@@ -394,36 +384,17 @@ class Model(Network):
                          'The model has ' + str(len(self.outputs)) +
                          ' outputs, but you passed '
                          'sample_weight_mode=' + str(sample_weight_mode))
-      for i in range(len(self.output_names)):
-        if i in skip_target_weighing_indices:
-          weight = None
-          sample_weight_modes.append(None)
-        else:
-          mode = sample_weight_mode[i]
-          name = self.output_names[i]
-          if mode == 'temporal':
-            weight = K.placeholder(ndim=2, name=name + '_sample_weights')
-            sample_weight_modes.append('temporal')
-          else:
-            weight = K.placeholder(ndim=1, name=name + '_sample_weights')
-            sample_weight_modes.append(None)
+      for i, name in enumerate(self.output_names):
+        weight, mode = training_utils.get_output_sample_weight_and_mode(
+            skip_target_weighing_indices, sample_weight_mode[i], name, i)
         sample_weights.append(weight)
+        sample_weight_modes.append(mode)
     else:
       for i, name in enumerate(self.output_names):
-        if i in skip_target_weighing_indices:
-          sample_weight_modes.append(None)
-          sample_weights.append(None)
-        else:
-          if sample_weight_mode == 'temporal':
-            sample_weights.append(array_ops.placeholder_with_default(
-                constant_op.constant([[1.]], dtype=K.floatx()),
-                shape=[None, None], name=name + '_sample_weights'))
-            sample_weight_modes.append('temporal')
-          else:
-            sample_weights.append(array_ops.placeholder_with_default(
-                constant_op.constant([1.], dtype=K.floatx()),
-                shape=[None], name=name + '_sample_weights'))
-            sample_weight_modes.append(None)
+        weight, mode = training_utils.get_output_sample_weight_and_mode(
+            skip_target_weighing_indices, sample_weight_mode, name, i)
+        sample_weights.append(weight)
+        sample_weight_modes.append(mode)
     self.sample_weight_modes = sample_weight_modes
     self._feed_sample_weight_modes = []
     for i in range(len(self.outputs)):
@@ -486,43 +457,21 @@ class Model(Network):
         weights = sample_weights[i]
         output_metrics = nested_metrics[i]
         output_weighted_metrics = nested_weighted_metrics[i]
+        output_shape = self.outputs[i].get_shape().as_list()
+        loss_fn = self.loss_functions[i]
 
-        def handle_metrics(metrics, weights=None):
+        def handle_metrics(metrics, output_shape, loss_fn, weights=None):
+          """Invokes metric functions for the output."""
 
           for metric in metrics:
-            if metric in ('accuracy', 'acc', 'crossentropy', 'ce'):
-              # custom handling of accuracy/crossentropy
-              # (because of class mode duality)
-              output_shape = self.outputs[i].get_shape().as_list()
-              if (output_shape[-1] == 1 or
-                  self.loss_functions[i] == losses.binary_crossentropy):
-                # case: binary accuracy/crossentropy
-                if metric in ('accuracy', 'acc'):
-                  metric_fn = metrics_module.binary_accuracy
-                elif metric in ('crossentropy', 'ce'):
-                  metric_fn = metrics_module.binary_crossentropy
-              elif self.loss_functions[
-                  i] == losses.sparse_categorical_crossentropy:
-                # case: categorical accuracy/crossentropy with sparse targets
-                if metric in ('accuracy', 'acc'):
-                  metric_fn = metrics_module.sparse_categorical_accuracy
-                elif metric in ('crossentropy', 'ce'):
-                  metric_fn = metrics_module.sparse_categorical_crossentropy
-              else:
-                # case: categorical accuracy/crossentropy
-                if metric in ('accuracy', 'acc'):
-                  metric_fn = metrics_module.categorical_accuracy
-                elif metric in ('crossentropy', 'ce'):
-                  metric_fn = metrics_module.categorical_crossentropy
-              weighted_metric_fn = training_utils.weighted_masked_objective(
-                  metric_fn)
-            else:
-              metric_fn = metrics_module.get(metric)
-              weighted_metric_fn = training_utils.weighted_masked_objective(
-                  metric_fn)
-            metric_name = training_utils.get_base_metric_name(
+            metric_fn = training_utils.get_metric_function(
+                metric, output_shape=output_shape, loss_fn=loss_fn)
+            metric_name = training_utils.get_metric_name(
                 metric, weighted=weights is not None)
+
             with K.name_scope(metric_name):
+              weighted_metric_fn = training_utils.weighted_masked_objective(
+                  metric_fn)
               metric_result = weighted_metric_fn(
                   y_true, y_pred, weights=weights, mask=masks[i])
 
@@ -536,8 +485,9 @@ class Model(Network):
               self.stateful_metric_functions.append(metric_fn)
               self.metrics_updates += metric_fn.updates
 
-        handle_metrics(output_metrics)
-        handle_metrics(output_weighted_metrics, weights=weights)
+        handle_metrics(output_metrics, output_shape, loss_fn)
+        handle_metrics(
+            output_weighted_metrics, output_shape, loss_fn, weights=weights)
 
     # Prepare gradient updates and state updates.
     self.total_loss = total_loss
@@ -607,7 +557,6 @@ class Model(Network):
             updates=updates,
             name='train_function',
             **self._function_kwargs)
-    self._post_build_cleanup()
 
   def _make_test_function(self):
     if not hasattr(self, 'test_function'):
@@ -625,7 +574,6 @@ class Model(Network):
           updates=self.state_updates + self.metrics_updates,
           name='test_function',
           **self._function_kwargs)
-    self._post_build_cleanup()
 
   def _make_predict_function(self):
     if not hasattr(self, 'predict_function'):
@@ -644,7 +592,6 @@ class Model(Network):
           updates=self.state_updates,
           name='predict_function',
           **kwargs)
-    self._post_build_cleanup()
 
   def _get_iterator_get_next_tensors(self, iterator):
     get_next_op = self._iterator_get_next.get(iterator, None)
@@ -991,10 +938,14 @@ class Model(Network):
         inputs = inputs[0]
 
       if tensor_util.is_tensor(inputs):
-        input_shape = (None,) + tuple(inputs.get_shape().as_list()[1:])
+        if context.executing_eagerly():
+          input_shape = (None,) + tuple(inputs.get_shape().as_list()[1:])
+          self.build(input_shape=input_shape)
+        else:
+          self.symbolic_set_inputs(inputs)
       else:
         input_shape = (None,) + inputs.shape[1:]
-      self.build(input_shape=input_shape)
+        self.build(input_shape=input_shape)
     elif context.executing_eagerly():
       self._eager_set_inputs(inputs)
     else:
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index 397de429851f356f5d0e01771ec2a784b4c1283d..b1149ab5b5b62c1cbe94c2515dde743dcded05cd 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -30,35 +30,11 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import callbacks as cbks
-from tensorflow.python.keras import losses
-from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.platform import tf_logging as logging
 
 
-def _get_metrics_info(metric, internal_output_shapes=None, loss_func=None):
-  if metric == 'accuracy' or metric == 'acc':
-    # custom handling of accuracy
-    # (because of class mode duality)
-    output_shape = internal_output_shapes
-    if output_shape[-1] == 1 or loss_func == losses.binary_crossentropy:
-      # case: binary accuracy
-      acc_fn = metrics_module.binary_accuracy
-    elif loss_func == losses.sparse_categorical_crossentropy:
-      # case: categorical accuracy with sparse targets
-      acc_fn = metrics_module.sparse_categorical_accuracy
-    else:
-      acc_fn = metrics_module.categorical_accuracy
-
-    metric_name = 'acc'
-    return metric_name, acc_fn
-  else:
-    metric_fn = metrics_module.get(metric)
-    metric_name = metric_fn.__name__
-    return metric_name, metric_fn
-
-
 def _eager_loss_fn(outputs, targets, loss_fn, output_name):
   with backend.name_scope(output_name + '_loss'):
     loss = loss_fn(targets, outputs)
@@ -74,9 +50,8 @@ def _eager_metrics_fn(model, outputs, targets):
       targets: The predictions or targets of the given model.
 
   Returns:
-      Returns the metric names and metric results for each output of the model.
+      Returns the metric results for each output of the model.
   """
-  metric_names = []
   metric_results = []
   if not isinstance(outputs, list):
     outputs = [outputs]
@@ -87,18 +62,15 @@ def _eager_metrics_fn(model, outputs, targets):
   for i in range(len(model.outputs)):
     output_metrics = model.nested_metrics[i]
     for nested_output_metric in output_metrics:
-      metric_name, metric_fn = _get_metrics_info(
+      metric_fn = training_utils.get_metric_function(
           nested_output_metric, backend.int_shape(model.outputs[i]),
           model.loss_functions[i])
-
-      if len(model.output_names) > 1:
-        metric_name = model.output_names[i] + '_' + metric_name
-        if metric_name not in model.metrics_names:
-          model.metrics_names.append(metric_name)
+      # weighted metrics are not supported in eager mode
+      metric_name = training_utils.get_metric_name(
+          nested_output_metric, weighted=False)
 
       with backend.name_scope(metric_name):
         metric_result = metric_fn(targets[i], outputs[i])
-        metric_names.append(metric_name)
         metric_results.append(backend.mean(metric_result))
 
   return metric_results
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index c621a88fb3ff63436649bc5c2d5e2a4c4234eccf..129441d159f3b6be09fae60ebac2a82b6c319fbe 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -731,6 +731,54 @@ class LossWeightingTest(test.TestCase):
         model.fit(x_np, [y_np, y_np], epochs=1,
                   sample_weight={'1': bad_w_np})
 
+  def test_default_sample_weight(self):
+    """Verifies that fit works without having to set sample_weight."""
+
+    num_classes = 5
+    input_dim = 5
+    timesteps = 3
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.TimeDistributed(
+              keras.layers.Dense(num_classes),
+              input_shape=(timesteps, input_dim)))
+
+      x = np.random.random((10, timesteps, input_dim))
+      y = np.random.random((10, timesteps, num_classes))
+
+      # sample_weight_mode is a list and mode value is None
+      model.compile(loss='mse', optimizer='rmsprop', sample_weight_mode=[None])
+      model.fit(x, y, epochs=1, batch_size=10)
+
+      # sample_weight_mode is a list and mode value is `temporal`
+      model.compile(
+          loss='mse', optimizer='rmsprop', sample_weight_mode=['temporal'])
+      model.fit(x, y, epochs=1, batch_size=10)
+
+      # sample_weight_mode is a dict and mode value is None
+      model.compile(
+          loss='mse',
+          optimizer='rmsprop',
+          sample_weight_mode={'time_distributed': None})
+      model.fit(x, y, epochs=1, batch_size=10)
+
+      # sample_weight_mode is a dict and mode value is `temporal`
+      model.compile(
+          loss='mse',
+          optimizer='rmsprop',
+          sample_weight_mode={'time_distributed': 'temporal'})
+      model.fit(x, y, epochs=1, batch_size=10)
+
+      # sample_weight_mode is a not a list/dict and mode value is None
+      model.compile(loss='mse', optimizer='rmsprop', sample_weight_mode=None)
+      model.fit(x, y, epochs=1, batch_size=10)
+
+      # sample_weight_mode is a not a list/dict and mode value is `temporal`
+      model.compile(
+          loss='mse', optimizer='rmsprop', sample_weight_mode='temporal')
+      model.fit(x, y, epochs=1, batch_size=10)
+
 
 class LossMaskingTest(test.TestCase):
 
@@ -767,6 +815,22 @@ class LossMaskingTest(test.TestCase):
               keras.backend.variable(weights), keras.backend.variable(mask)))
 
 
+class LearningPhaseTest(test.TestCase):
+
+  def test_empty_model_no_learning_phase(self):
+    with self.test_session():
+      model = keras.models.Sequential()
+      self.assertFalse(model.uses_learning_phase)
+
+  def test_dropout_has_learning_phase(self):
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_dim=3))
+      model.add(keras.layers.Dropout(0.5))
+      model.add(keras.layers.Dense(2))
+      self.assertTrue(model.uses_learning_phase)
+
+
 class TestDynamicTrainability(test.TestCase):
 
   def test_trainable_warning(self):
diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
index dbbc87daf96e794b2f163f74bfd2cd4b9883e936..f2cd9c89dafe4553cdd6e6137a62c254ad54f25c 100644
--- a/tensorflow/python/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -26,10 +26,12 @@ import numpy as np
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 
 
@@ -700,17 +702,16 @@ def populate_metric_names(model):
   for i in range(len(model.outputs)):
     metrics = model.nested_metrics[i]
     for metric in metrics:
-      base_metric_name = get_base_metric_name(metric)
+      base_metric_name = get_metric_name(metric)
       add_metric_name(model, base_metric_name, i)
 
 
-def get_base_metric_name(metric, weighted=False):
-  """Returns the metric name given the metric function.
+def get_metric_name(metric, weighted=False):
+  """Returns the metric name corresponding to the given metric input.
 
   Arguments:
       metric: Metric function name or reference.
-      weighted: Boolean indicating if the metric for which we are adding
-          names is weighted.
+      weighted: Boolean indicating if the given metric is weighted.
 
   Returns:
       a metric name.
@@ -734,6 +735,36 @@ def get_base_metric_name(metric, weighted=False):
   return metric_name
 
 
+def get_metric_function(metric, output_shape=None, loss_fn=None):
+  """Returns the metric function corresponding to the given metric input.
+
+  Arguments:
+      metric: Metric function name or reference.
+      output_shape: The shape of the output that this metric
+          will be calculated for.
+      loss_fn: The loss function used.
+
+  Returns:
+      The metric function.
+  """
+  if metric in ['accuracy', 'acc']:
+    if output_shape[-1] == 1 or loss_fn == losses.binary_crossentropy:
+      return metrics_module.binary_accuracy  # case: binary accuracy
+    elif loss_fn == losses.sparse_categorical_crossentropy:
+      # case: categorical accuracy with sparse targets
+      return metrics_module.sparse_categorical_accuracy
+    return metrics_module.categorical_accuracy  # case: categorical accuracy
+  elif metric in ['crossentropy', 'ce']:
+    if output_shape[-1] == 1 or loss_fn == losses.binary_crossentropy:
+      return metrics_module.binary_crossentropy  # case: binary cross-entropy
+    elif loss_fn == losses.sparse_categorical_crossentropy:
+      # case: categorical cross-entropy with sparse targets
+      return metrics_module.sparse_categorical_crossentropy
+    # case: categorical cross-entropy
+    return metrics_module.categorical_crossentropy
+  return metrics_module.get(metric)
+
+
 def add_metric_name(model, metric_name, index):
   """Makes the metric name unique and adds it to the model's metric name list.
 
@@ -856,3 +887,25 @@ def cast_if_floating_dtype(x):
         for val in x
     ]
   return math_ops.cast(x, dtype=K.floatx()) if x.dtype.is_floating else x
+
+
+def get_output_sample_weight_and_mode(skip_target_weighing_indices,
+                                      sample_weight_mode, output_name,
+                                      output_index):
+  """Returns the sample weight and weight mode for a single output."""
+  if output_index in skip_target_weighing_indices:
+    return None, None
+
+  if sample_weight_mode == 'temporal':
+    default_value = [[1.]]
+    shape = [None, None]
+    mode = 'temporal'
+  else:
+    default_value = [1.]
+    shape = [None]
+    mode = None
+  weight = array_ops.placeholder_with_default(
+      constant_op.constant(default_value, dtype=K.floatx()),
+      shape=shape,
+      name=output_name + '_sample_weights')
+  return weight, mode
diff --git a/tensorflow/python/keras/layers/advanced_activations.py b/tensorflow/python/keras/layers/advanced_activations.py
index eba10da6f3ce1367f4cb0180d16efdc5913fcddc..61ab69c16f14b8d734a306ab3ad18c73eaf160ca 100644
--- a/tensorflow/python/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/layers/advanced_activations.py
@@ -284,6 +284,13 @@ class Softmax(Layer):
 class ReLU(Layer):
   """Rectified Linear Unit activation function.
 
+  With default values, it returns element-wise `max(x, 0)`.
+
+  Otherwise, it follows:
+  `f(x) = max_value` for `x >= max_value`,
+  `f(x) = x` for `threshold <= x < max_value`,
+  `f(x) = negative_slope * (x - threshold)` otherwise.
+
   Input shape:
       Arbitrary. Use the keyword argument `input_shape`
       (tuple of integers, does not include the samples axis)
@@ -294,21 +301,39 @@ class ReLU(Layer):
 
   Arguments:
       max_value: float >= 0. Maximum activation value.
+      negative_slope: float >= 0. Negative slope coefficient.
+      threshold: float. Threshold value for thresholded activation.
   """
 
-  def __init__(self, max_value=None, **kwargs):
+  def __init__(self, max_value=None, negative_slope=0, threshold=0, **kwargs):
     super(ReLU, self).__init__(**kwargs)
-    self.support_masking = True
-    self.max_value = K.cast_to_floatx(max_value)
-    if self.max_value < 0.:
+    if max_value is not None and max_value < 0.:
       raise ValueError('max_value of Relu layer '
                        'cannot be negative value: ' + str(max_value))
+    if negative_slope < 0.:
+      raise ValueError('negative_slope of Relu layer '
+                       'cannot be negative value: ' + str(negative_slope))
+
+    self.support_masking = True
+    self.max_value = K.cast_to_floatx(max_value)
+    self.negative_slope = K.cast_to_floatx(negative_slope)
+    self.threshold = K.cast_to_floatx(threshold)
 
   def call(self, inputs):
-    return activations.relu(inputs, max_value=self.max_value)
+    # alpha is used for leaky relu slope in activations instead of
+    # negative_slope.
+    return activations.relu(
+        inputs,
+        alpha=self.negative_slope,
+        max_value=self.max_value,
+        threshold=self.threshold)
 
   def get_config(self):
-    config = {'max_value': self.max_value}
+    config = {
+        'max_value': self.max_value,
+        'negative_slope': self.negative_slope,
+        'threshold': self.threshold
+    }
     base_config = super(ReLU, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
diff --git a/tensorflow/python/keras/layers/advanced_activations_test.py b/tensorflow/python/keras/layers/advanced_activations_test.py
index 9e1f15b1bc508d8be0a2c0190d07eb1c2bed95c4..53c1baa2bbd4367eb09d5bc792de9f20baa981ef 100644
--- a/tensorflow/python/keras/layers/advanced_activations_test.py
+++ b/tensorflow/python/keras/layers/advanced_activations_test.py
@@ -75,6 +75,14 @@ class AdvancedActivationsTest(test.TestCase):
         testing_utils.layer_test(keras.layers.ReLU,
                                  kwargs={'max_value': -10},
                                  input_shape=(2, 3, 4))
+    with self.assertRaisesRegexp(
+        ValueError,
+        'negative_slope of Relu layer cannot be negative value: -2'):
+      with self.test_session():
+        testing_utils.layer_test(
+            keras.layers.ReLU,
+            kwargs={'negative_slope': -2},
+            input_shape=(2, 3, 4))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent_test.py b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
index 8fd970239f205031954c728474abdf10ea80e99e..2ed0aa8f2684009251e61c92a1ac167f1ba2f0af 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent_test.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent_test.py
@@ -220,7 +220,7 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
         self.assertNotEqual(out4.max(), out5.max())
 
   @parameterized.named_parameters(
-      *testing_utils.generate_combinations_with_testcase_name(
+      *test_util.generate_combinations_with_testcase_name(
           rnn_type=['LSTM', 'GRU'], to_cudnn=[True, False],
           bidirectional=[True, False], implementation=[1, 2],
           model_nest_level=[1, 2], model_type=['seq', 'func']))
@@ -301,7 +301,7 @@ class CuDNNTest(test.TestCase, parameterized.TestCase):
     os.remove(fname)
 
   @parameterized.named_parameters(
-      *testing_utils.generate_combinations_with_testcase_name(
+      *test_util.generate_combinations_with_testcase_name(
           rnn_type=['LSTM', 'GRU'], to_cudnn=[True, False]))
   def test_load_weights_between_noncudnn_rnn_time_distributed(self, rnn_type,
                                                               to_cudnn):
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index 72e15763cb9bc995a43f521d0a7ddca063ff8a19..7d8b1fec45cc53fa0a5fc0da269772fbf16653ce 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -21,6 +21,8 @@ from __future__ import print_function
 
 from abc import ABCMeta
 from abc import abstractmethod
+
+import types
 import six
 
 from tensorflow.python.eager import context
@@ -58,7 +60,14 @@ from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.tf_export import tf_export
 
 
-def update_state(update_state_fn):
+def check_is_tensor_or_operation(x, name):
+  """Raises type error if the given input is not a tensor or operation."""
+  if not (isinstance(x, ops.Tensor) or isinstance(x, ops.Operation)):
+    raise TypeError('{0} must be a Tensor or Operation, given: {1}'.format(
+        name, x))
+
+
+def update_state_wrapper(update_state_fn):
   """Decorator to wrap metric `update_state()` with `defun()`, `add_update()`.
 
   Args:
@@ -70,7 +79,7 @@ def update_state(update_state_fn):
       executed to update the metric state with the given inputs.
   """
 
-  def decorated(*args, **kwargs):
+  def decorated(metric_obj, *args, **kwargs):
     """Decorated function with `defun()` and `add_update()`."""
 
     # Converting update_state_fn() into a graph function, so that
@@ -79,14 +88,15 @@ def update_state(update_state_fn):
     defuned_update_state_fn = function.defun(update_state_fn)
     update_op = defuned_update_state_fn(*args, **kwargs)
     if update_op is not None:  # update_op will be None in eager execution.
-      metric_obj = args[0]
       metric_obj.add_update(update_op, inputs=True)
+      check_is_tensor_or_operation(
+          update_op, 'Metric {0}\'s update'.format(metric_obj.name))
     return update_op
 
   return tf_decorator.make_decorator(update_state_fn, decorated)
 
 
-def result(result_fn):
+def result_wrapper(result_fn):
   """Decorator to wrap metric `result()` function in `merge_call()`.
 
   Result computation is an idempotent operation that simply calculates the
@@ -104,26 +114,29 @@ def result(result_fn):
     The metric result tensor.
   """
 
-  def decorated(*args):
+  def decorated(metric_obj, *args):
     """Decorated function with merge_call."""
     tower_context = distribute_lib.get_tower_context()
     if tower_context is None:  # if in cross tower context already
-      return result_fn()
-
-    # TODO(psv): Test distribution of metrics using different distribution
-    # strategies.
-
-    # Creating a wrapper for merge_fn. merge_call invokes the given merge_fn
-    # with distribution object as the first parameter. We create a wrapper here
-    # so that the result function need not have that parameter.
-    def merge_fn_wrapper(distribution, merge_fn, *args):
-      # We will get `PerDevice` merge function. Taking the first one as all are
-      # identical copies of the function that we had passed below.
-      return distribution.unwrap(merge_fn)[0](*args)
-
-    # Wrapping result in merge_call. merge_call is used when we want to leave
-    # tower mode and compute a value in cross tower mode.
-    return tower_context.merge_call(merge_fn_wrapper, result_fn, *args)
+      result_t = result_fn(*args)
+    else:
+      # TODO(psv): Test distribution of metrics using different distribution
+      # strategies.
+
+      # Creating a wrapper for merge_fn. merge_call invokes the given merge_fn
+      # with distribution object as the first parameter. We create a wrapper
+      # here so that the result function need not have that parameter.
+      def merge_fn_wrapper(distribution, merge_fn, *args):
+        # We will get `PerDevice` merge function. Taking the first one as all
+        # are identical copies of the function that we had passed below.
+        return distribution.unwrap(merge_fn)[0](*args)
+
+      # Wrapping result in merge_call. merge_call is used when we want to leave
+      # tower mode and compute a value in cross tower mode.
+      result_t = tower_context.merge_call(merge_fn_wrapper, result_fn, *args)
+    check_is_tensor_or_operation(result_t,
+                                 'Metric {0}\'s result'.format(metric_obj.name))
+    return result_t
 
   return tf_decorator.make_decorator(result_fn, decorated)
 
@@ -246,24 +259,19 @@ class Metric(Layer):
   * `__init__()`: All state variables should be created in this method by
     calling `self.add_weight()` like: `self.var = self.add_weight(...)`
   * `update_state()`: Has all updates to the state variables like:
-    self.var.assign_add(...). Please decorate the function with:
-    @update_state: Converts `update_state()` into a graph function, so that
-    we can return a single op that performs all of the variable updates and
-    adds the update op to the metric layer.
+    self.var.assign_add(...).
   * `result()`: Computes and returns a value for the metric
-    from the state variables. Please decorate the function with:
-    @result: Wraps `result()` in a distribution strategy merge_call().
+    from the state variables.
 
   Example subclass implementation:
 
   ```
   class BinaryTruePositives(Metric):
-    def __init__(self, name='binary-true-positives', dtype=dtypes.float64):
+    def __init__(self, name='binary-true-positives', dtype=None):
       super(BinaryTruePositives, self).__init__(name=name, dtype=dtype)
       self.true_positives = self.add_weight(
           'true_positives', initializer=init_ops.zeros_initializer)
 
-    @update_state
     def update_state(self, y_true, y_pred, sample_weight=None):
       y_true = math_ops.cast(y_true, dtypes.bool)
       y_pred = math_ops.cast(y_pred, dtypes.bool)
@@ -278,17 +286,24 @@ class Metric(Layer):
         values = math_ops.multiply(values, sample_weight)
       state_ops.assign_add(self.true_positives, math_ops.reduce_sum(values))
 
-    @result
     def result(self):
       return array_ops.identity(self.true_positives)
   ```
   """
   __metaclass__ = ABCMeta
 
-  def __init__(self, name=None, dtype=dtypes.float64):
+  def __init__(self, name=None, dtype=None):
     super(Metric, self).__init__(name=name, dtype=dtype)
     self.stateful = True  # All metric layers are stateful.
     self.built = True
+    self._dtype = K.floatx() if dtype is None else dtypes.as_dtype(dtype).name
+
+  def __new__(cls, *args, **kwargs):
+    obj = super(Metric, cls).__new__(cls, *args, **kwargs)
+    obj.update_state = types.MethodType(
+        update_state_wrapper(obj.update_state), obj)
+    obj.result = types.MethodType(result_wrapper(obj.result), obj)
+    return obj
 
   def __call__(self, *args, **kwargs):
     """Accumulates statistics and then computes metric result value.
@@ -301,9 +316,9 @@ class Metric(Layer):
     Returns:
       The metric value tensor.
     """
-    update_op = self.update_state(*args, **kwargs)
+    update_op = self.update_state(*args, **kwargs)  # pylint: disable=not-callable
     with ops.control_dependencies([update_op]):
-      return self.result()
+      return self.result()  # pylint: disable=not-callable
 
   def reset_states(self):
     """Resets all of the metric state variables.
@@ -318,10 +333,8 @@ class Metric(Layer):
   def update_state(self, *args, **kwargs):
     """Accumulates statistics for the metric.
 
-    Please decorate the function with:
-    @update_state: Converts `update_state()` into a graph function, so that
-    we can return a single op that performs all of the variable updates
-      This means:
+    Note: This function is executed as a graph function in graph mode.
+    This means:
       a) Operations on the same resource are executed in textual order.
          This should make it easier to do things like add the updated
          value of a variable to another, for example.
@@ -343,9 +356,6 @@ class Metric(Layer):
 
     Result computation is an idempotent operation that simply calculates the
     metric value using the state variables.
-
-    Please decorate the function with:
-    @result: Wraps `result()` in a distribution strategy merge_call().
     """
     NotImplementedError('Must be implemented in subclasses.')
 
@@ -380,7 +390,13 @@ class Mean(Metric):
   Use `sample_weight` of 0 to mask values.
   """
 
-  def __init__(self, name='mean', dtype=dtypes.float64):
+  def __init__(self, name='mean', dtype=None):
+    """Creates a `Mean` instance.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
     super(Mean, self).__init__(name=name, dtype=dtype)
     # Create new state variables
     self.total = self.add_weight(
@@ -388,7 +404,6 @@ class Mean(Metric):
     self.count = self.add_weight(
         'count', initializer=init_ops.zeros_initializer)
 
-  @update_state
   def update_state(self, values, sample_weight=None):
     """Accumulates statistics for computing the mean.
 
@@ -418,14 +433,84 @@ class Mean(Metric):
     state_ops.assign_add(self.total, values)
     state_ops.assign_add(self.count, num_values)
 
-  @result
   def result(self):
     return _safe_div(self.total, self.count)
 
 
+class MeanMetricWrapper(Mean):
+  """Wraps a stateless metric function with the Mean metric."""
+
+  def __init__(self, fn, name=None, dtype=None, **kwargs):
+    """Creates a `MeanMetricWrapper` instance.
+
+    Args:
+      fn: The metric function to wrap, with signature
+        `fn(y_true, y_pred, **kwargs)`.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+      **kwargs: The keyword arguments that are passed on to `fn`.
+    """
+    super(MeanMetricWrapper, self).__init__(name=name, dtype=dtype)
+    self._fn = fn
+    self._fn_kwargs = kwargs
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates metric statistics.
+
+    `y_true` and `y_pred` should have the same shape.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be
+        a `Tensor` whose rank is either 0, or the same rank as `y_true`,
+        and must be broadcastable to `y_true`.
+    """
+    y_true = math_ops.cast(y_true, self._dtype)
+    y_pred = math_ops.cast(y_pred, self._dtype)
+    y_pred, y_true, sample_weight = _squeeze_or_expand_dimensions(
+        y_pred, y_true, sample_weight)
+
+    matches = self._fn(y_true, y_pred, **self._fn_kwargs)
+    super(MeanMetricWrapper, self).update_state(
+        matches, sample_weight=sample_weight)
+
+  def get_config(self):
+    config = self._fn_kwargs
+    base_config = super(MeanMetricWrapper, self).get_config()
+    return dict(list(base_config.items()) + list(config.items()))
+
+
+class BinaryAccuracy(MeanMetricWrapper):
+  """Calculates how often predictions matches labels.
+
+  This metric creates two local variables, `total` and `count` that are used to
+  compute the frequency with which `y_pred` matches `y_true`. This frequency is
+  ultimately returned as `binary accuracy`: an idempotent operation that simply
+  divides `total` by `count`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+  """
+
+  def __init__(self, name='binary-accuracy', dtype=None, threshold=0.5):
+    """Creates a `BinaryAccuracy` instance.
+
+    Args:
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+      threshold: (Optional) Float representing the threshold for deciding
+      whether prediction values are 1 or 0.
+    """
+    super(BinaryAccuracy, self).__init__(
+        binary_accuracy, name, dtype=dtype, threshold=threshold)
+
+
 @tf_export('keras.metrics.binary_accuracy')
-def binary_accuracy(y_true, y_pred):
-  return K.mean(math_ops.equal(y_true, math_ops.round(y_pred)), axis=-1)
+def binary_accuracy(y_true, y_pred, threshold=0.5):
+  threshold = math_ops.cast(threshold, y_pred.dtype)
+  y_pred = math_ops.cast(y_pred > threshold, y_pred.dtype)
+  return K.mean(math_ops.equal(y_true, y_pred), axis=-1)
 
 
 @tf_export('keras.metrics.categorical_accuracy')
diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index 6d8269f34dff2bf016f99eca62c6f92ec2b3e687..d5833797080c5a40ddd4e7f905a2641d80f66425 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -196,7 +196,7 @@ class KerasMetricsTest(test.TestCase):
     # check config
     self.assertEqual(m.name, 'my_mean')
     self.assertTrue(m.stateful)
-    self.assertEqual(m.dtype, dtypes.float64)
+    self.assertEqual(m.dtype, dtypes.float32)
     self.assertEqual(len(m.variables), 2)
     self.evaluate(variables.global_variables_initializer())
 
@@ -212,7 +212,7 @@ class KerasMetricsTest(test.TestCase):
     # check update_state() and result() + state accumulation + tensor input
     update_op = m.update_state(ops.convert_n_to_tensor([1, 5]))
     self.evaluate(update_op)
-    self.assertEqual(self.evaluate(m.result()), 106 / 3)
+    self.assertAlmostEqual(self.evaluate(m.result()), 106 / 3, 2)
     self.assertEqual(self.evaluate(m.total), 106)  # 100 + 1 + 5
     self.assertEqual(self.evaluate(m.count), 3)
 
@@ -223,7 +223,8 @@ class KerasMetricsTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def test_mean_with_sample_weight(self):
-    m = metrics.Mean()
+    m = metrics.Mean(dtype=dtypes.float64)
+    self.assertEqual(m.dtype, dtypes.float64)
     self.evaluate(variables.global_variables_initializer())
 
     # check scalar weight
@@ -308,6 +309,94 @@ class KerasMetricsTest(test.TestCase):
     self.assertEqual(200., self.evaluate(restore_mean.result()))
     self.assertEqual(3, self.evaluate(restore_mean.count))
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_binary_accuracy(self):
+    acc_obj = metrics.BinaryAccuracy(name='my acc')
+
+    # check config
+    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertTrue(acc_obj.stateful)
+    self.assertEqual(len(acc_obj.variables), 2)
+    self.assertEqual(acc_obj.dtype, dtypes.float32)
+    self.evaluate(variables.global_variables_initializer())
+
+    # verify that correct value is returned
+    update_op = acc_obj.update_state([[1], [0]], [[1], [0]])
+    self.evaluate(update_op)
+    result = self.evaluate(acc_obj.result())
+    self.assertEqual(result, 1)  # 2/2
+
+    # check y_pred squeeze
+    update_op = acc_obj.update_state([[1], [1]], [[[1]], [[0]]])
+    self.evaluate(update_op)
+    result = self.evaluate(acc_obj.result())
+    self.assertAlmostEqual(result, 0.75, 2)  # 3/4
+
+    # check y_true squeeze
+    result_t = acc_obj([[[1]], [[1]]], [[1], [0]])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 0.67, 2)  # 4/6
+
+    # check with sample_weight
+    result_t = acc_obj([[1], [1]], [[1], [0]], [[0.5], [0.2]])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 0.67, 2)  # 4.5/6.7
+
+    # check incompatible shapes
+    with self.assertRaisesRegexp(ValueError,
+                                 r'Shapes \(1,\) and \(2,\) are incompatible'):
+      acc_obj.update_state([1, 1], [1])
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_binary_accuracy_threshold(self):
+    acc_obj = metrics.BinaryAccuracy(threshold=0.7)
+    self.evaluate(variables.global_variables_initializer())
+    result_t = acc_obj([[1], [1], [0], [0]], [[0.9], [0.6], [0.4], [0.8]])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 0.5, 2)
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_invalid_result(self):
+
+    class InvalidResult(metrics.Metric):
+
+      def __init__(self, name='invalid-result', dtype=dtypes.float64):
+        super(InvalidResult, self).__init__(name=name, dtype=dtype)
+
+      def update_state(self, *args, **kwargs):
+        pass
+
+      def result(self):
+        return 1
+
+    invalid_result_obj = InvalidResult()
+    with self.assertRaisesRegexp(
+        TypeError,
+        'Metric invalid-result\'s result must be a Tensor or Operation, given:'
+    ):
+      invalid_result_obj.result()
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_invalid_update(self):
+
+    class InvalidUpdate(metrics.Metric):
+
+      def __init__(self, name='invalid-update', dtype=dtypes.float64):
+        super(InvalidUpdate, self).__init__(name=name, dtype=dtype)
+
+      def update_state(self, *args, **kwargs):
+        return [1]
+
+      def result(self):
+        pass
+
+    invalid_update_obj = InvalidUpdate()
+    with self.assertRaisesRegexp(
+        TypeError,
+        'Metric invalid-update\'s update must be a Tensor or Operation, given:'
+    ):
+      invalid_update_obj.update_state()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py
index 3ac4852eff6910a9861ae959f990978cea33d595..3a153573f8d5b23d40ffcac6721550507100d16b 100644
--- a/tensorflow/python/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/model_subclassing_test.py
@@ -29,6 +29,8 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training.checkpointable import data_structures
@@ -65,6 +67,22 @@ class SimpleTestModel(keras.Model):
     return self.dense2(x)
 
 
+class SimpleConvTestModel(keras.Model):
+
+  def __init__(self, num_classes=10):
+    super(SimpleConvTestModel, self).__init__(name='test_model')
+    self.num_classes = num_classes
+
+    self.conv1 = keras.layers.Conv2D(32, (3, 3), activation='relu')
+    self.flatten = keras.layers.Flatten()
+    self.dense1 = keras.layers.Dense(num_classes, activation='softmax')
+
+  def call(self, x):
+    x = self.conv1(x)
+    x = self.flatten(x)
+    return self.dense1(x)
+
+
 class MultiIOTestModel(keras.Model):
 
   def __init__(self, use_bn=False, use_dp=False, num_classes=(2, 3)):
@@ -162,9 +180,6 @@ def get_nested_model_3(input_dim, num_classes):
       x = self.dense2(x)
       return self.bn(x)
 
-    def compute_output_shape(self, input_shape):
-      return tensor_shape.TensorShape((input_shape[0], 5))
-
   test_model = Inner()
   x = test_model(x)
   outputs = keras.layers.Dense(num_classes)(x)
@@ -173,6 +188,213 @@ def get_nested_model_3(input_dim, num_classes):
 
 class ModelSubclassingTest(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_invalid_input_shape_build(self):
+    num_classes = 2
+    input_dim = 50
+
+    model = SimpleTestModel(num_classes=num_classes,
+                            use_dp=True,
+                            use_bn=True)
+
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    with self.assertRaisesRegexp(
+        ValueError, 'input shape is not one of the valid types'):
+      model.build(input_shape=tensor_shape.Dimension(input_dim))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_embed_dtype_with_subclass_build(self):
+    class Embedding(keras.layers.Layer):
+      """An Embedding layer."""
+
+      def __init__(self, vocab_size, embedding_dim, **kwargs):
+        super(Embedding, self).__init__(**kwargs)
+        self.vocab_size = vocab_size
+        self.embedding_dim = embedding_dim
+
+      def build(self, _):
+        self.embedding = self.add_variable(
+            'embedding_kernel',
+            shape=[self.vocab_size, self.embedding_dim],
+            dtype=np.float32,
+            initializer=init_ops.random_uniform_initializer(-0.1, 0.1),
+            trainable=True)
+
+      def call(self, x):
+        return embedding_ops.embedding_lookup(self.embedding, x)
+
+    class EmbedModel(keras.Model):
+
+      def __init__(self, vocab_size, embed_size):
+        super(EmbedModel, self).__init__()
+        self.embed1 = Embedding(vocab_size, embed_size)
+
+      def call(self, inputs):
+        return self.embed1(inputs)
+
+    model = EmbedModel(100, 20)
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    with self.assertRaisesRegexp(
+        ValueError, 'if your layers do not support float type inputs'):
+      model.build(input_shape=(35, 20))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_single_time_step_rnn_build(self):
+    dim = 4
+    timesteps = 1
+    batch_input_shape = (None, timesteps, dim)
+    units = 3
+
+    class SimpleRNNModel(keras.Model):
+
+      def __init__(self):
+        super(SimpleRNNModel, self).__init__()
+        self.lstm = keras.layers.LSTM(units)
+
+      def call(self, inputs):
+        return self.lstm(inputs)
+
+    model = SimpleRNNModel()
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    model.build(batch_input_shape)
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+    model(array_ops.ones((32, timesteps, dim)))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_single_io_subclass_build(self):
+    num_classes = 2
+    input_dim = 50
+    batch_size = None
+
+    model = SimpleTestModel(num_classes=num_classes,
+                            use_dp=True,
+                            use_bn=True)
+
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    model.build(input_shape=(batch_size, input_dim))
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+    model(array_ops.ones((32, input_dim)))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_single_io_dimension_subclass_build(self):
+    num_classes = 2
+    input_dim = tensor_shape.Dimension(50)
+    batch_size = tensor_shape.Dimension(None)
+
+    model = SimpleTestModel(num_classes=num_classes,
+                            use_dp=True,
+                            use_bn=True)
+
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    model.build(input_shape=(batch_size, input_dim))
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+    model(array_ops.ones((32, input_dim)))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_multidim_io_subclass_build(self):
+    num_classes = 10
+    # Input size, e.g. image
+    batch_size = 32
+    input_shape = (32, 32, 3)
+
+    model = SimpleConvTestModel(num_classes)
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    batch_input_shape = (batch_size,) + input_shape
+    model.build(input_shape=batch_input_shape)
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+
+    model(array_ops.ones(batch_input_shape))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_tensorshape_io_subclass_build(self):
+    num_classes = 10
+    # Input size, e.g. image
+    batch_size = None
+    input_shape = (32, 32, 3)
+
+    model = SimpleConvTestModel(num_classes)
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    model.build(
+        input_shape=tensor_shape.TensorShape((batch_size,) + input_shape))
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+
+    model(array_ops.ones((32,) + input_shape))
+
+  def test_subclass_save_model(self):
+    num_classes = 10
+    # Input size, e.g. image
+    batch_size = None
+    input_shape = (32, 32, 3)
+
+    model = SimpleConvTestModel(num_classes)
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    model.build(
+        input_shape=tensor_shape.TensorShape((batch_size,) + input_shape))
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+    weights = model.get_weights()
+
+    tf_format_name = os.path.join(self.get_temp_dir(), 'ckpt')
+    model.save_weights(tf_format_name)
+    if h5py is not None:
+      hdf5_format_name = os.path.join(self.get_temp_dir(), 'weights.h5')
+      model.save_weights(hdf5_format_name)
+
+    model = SimpleConvTestModel(num_classes)
+    model.build(
+        input_shape=tensor_shape.TensorShape((batch_size,) + input_shape))
+    if h5py is not None:
+      model.load_weights(hdf5_format_name)
+      self.assertAllClose(weights, model.get_weights())
+    model.load_weights(tf_format_name)
+    self.assertAllClose(weights, model.get_weights())
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_multi_io_subclass_build(self):
+    batch_size = None
+    num_samples = 1000
+    input_dim = 50
+    model = MultiIOTestModel()
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    batch_input_shape = tensor_shape.TensorShape((batch_size, input_dim))
+    model.build(
+        input_shape=[batch_input_shape, batch_input_shape])
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+    x1 = array_ops.ones((num_samples, input_dim))
+    x2 = array_ops.ones((num_samples, input_dim))
+    model([x1, x2])
+
   @test_util.run_in_graph_and_eager_modes
   def test_single_io_workflow_with_np_arrays(self):
     num_classes = 2
@@ -750,6 +972,16 @@ class CustomCallModel(keras.Model):
       return combined
 
 
+class TrainingNoDefaultModel(keras.Model):
+
+  def __init__(self):
+    super(TrainingNoDefaultModel, self).__init__()
+    self.dense1 = keras.layers.Dense(1)
+
+  def call(self, x, training):
+    return self.dense1(x)
+
+
 class CustomCallSignatureTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
@@ -766,6 +998,32 @@ class CustomCallSignatureTests(test.TestCase):
     output = model(first, second=second, training=False)
     self.assertAllClose(expected_output, self.evaluate(output))
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_training_args_call_build(self):
+    input_dim = 2
+
+    model = TrainingNoDefaultModel()
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    model.build((None, input_dim))
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_custom_call_kwargs_and_build(self):
+    first_input_shape = (2, 3)
+    second_input_shape = (2, 5)
+
+    model = CustomCallModel()
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    with self.assertRaisesRegexp(
+        ValueError, 'cannot build your model if it has positional'):
+      model.build(input_shape=[first_input_shape, second_input_shape])
+
   @test_util.run_in_graph_and_eager_modes
   def test_inputs_in_signature(self):
 
@@ -829,14 +1087,9 @@ class CustomCallSignatureTests(test.TestCase):
 
   def test_training_no_default(self):
 
-    class TrainingNoDefault(keras.Model):
-
-      def call(self, x, training):
-        return x
-
     with context.graph_mode():
-      model = TrainingNoDefault()
-      arg = array_ops.ones([])
+      model = TrainingNoDefaultModel()
+      arg = array_ops.ones([1, 1])
       model(arg, True)
       six.assertCountEqual(self, [arg], model.inputs)
 
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index 17aba7d86c236d9bb30d3a3376b3aac40b69e77d..6e8ee06ff53691c3f439114bd6c4745c03cf9a10 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from collections import OrderedDict
 import numpy as np
 
 from tensorflow.python import keras
@@ -185,75 +184,3 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
   # for further checks in the caller function
   return actual_output
 
-
-def _combine_named_parameters(**kwargs):
-  """Generate combinations based on its keyword arguments.
-
-  Two sets of returned combinations can be concatenated using +.  Their product
-  can be computed using `times()`.
-
-  Args:
-    **kwargs: keyword arguments of form `option=[possibilities, ...]`
-         or `option=the_only_possibility`.
-
-  Returns:
-    a list of dictionaries for each combination. Keys in the dictionaries are
-    the keyword argument names.  Each key has one value - one of the
-    corresponding keyword argument values.
-  """
-  if not kwargs:
-    return [OrderedDict()]
-
-  sort_by_key = lambda k: k[0][0]
-  kwargs = OrderedDict(sorted(kwargs.items(), key=sort_by_key))
-  first = list(kwargs.items())[0]
-
-  rest = dict(list(kwargs.items())[1:])
-  rest_combined = _combine_named_parameters(**rest)
-
-  key = first[0]
-  values = first[1]
-  if not isinstance(values, list):
-    values = [values]
-
-  combinations = [
-      OrderedDict(sorted(list(combined.items()) + [(key, v)], key=sort_by_key))
-      for v in values
-      for combined in rest_combined
-  ]
-  return combinations
-
-
-def generate_combinations_with_testcase_name(**kwargs):
-  """Generate combinations based on its keyword arguments using combine().
-
-  This function calls combine() and appends a testcase name to the list of
-  dictionaries returned. The 'testcase_name' key is a required for named
-  parameterized tests.
-
-  Args:
-    **kwargs: keyword arguments of form `option=[possibilities, ...]`
-         or `option=the_only_possibility`.
-
-  Returns:
-    a list of dictionaries for each combination. Keys in the dictionaries are
-    the keyword argument names.  Each key has one value - one of the
-    corresponding keyword argument values.
-  """
-  combinations = _combine_named_parameters(**kwargs)
-  named_combinations = []
-  for combination in combinations:
-    assert isinstance(combination, OrderedDict)
-    name = ''.join([
-        '_{}_{}'.format(
-            ''.join(filter(str.isalnum, key)),
-            ''.join(filter(str.isalnum, str(value))))
-        for key, value in combination.items()
-    ])
-    named_combinations.append(
-        OrderedDict(
-            list(combination.items()) + [('testcase_name',
-                                          '_test{}'.format(name))]))
-
-  return named_combinations
-
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 838cf836f1784ed97373d87b9af8889aa4d40145..adf97569ab4446fdc23b7dc3c0e7d92a9a5b20ae 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -2845,7 +2845,6 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
     ],
     shard_count = 20,
-    tags = ["nomsan"],  # TODO(b/110990716) reenable
 )
 
 cuda_py_test(
@@ -3095,7 +3094,7 @@ tf_py_test(
 
 tf_py_test(
     name = "cond_v2_test",
-    size = "small",
+    size = "medium",
     srcs = ["cond_v2_test.py"],
     additional_deps = [
         "//tensorflow/python:array_ops",
@@ -3110,4 +3109,5 @@ tf_py_test(
         "//tensorflow/python:training",
     ],
     grpc_enabled = True,
+    tags = ["no_gpu"],  # TODO(b/111656070)
 )
diff --git a/tensorflow/python/kernel_tests/argmax_op_test.py b/tensorflow/python/kernel_tests/argmax_op_test.py
index ce0676990221fb441b99043083647f9d65722db8..1202c463e80d21b7cf88e5596cfc64eaa38ef8ba 100644
--- a/tensorflow/python/kernel_tests/argmax_op_test.py
+++ b/tensorflow/python/kernel_tests/argmax_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -115,6 +116,12 @@ class ArgMaxTest(test.TestCase):
         ans = op([1]).eval()
         self.assertAllEqual(ans, 0)
 
+  def testOutputEmpty(self):
+    with self.test_session():
+      for op in math_ops.argmin, math_ops.argmax:
+        ret = op(array_ops.zeros(shape=[1, 0, 2]), axis=-1).eval()
+        self.assertEqual(ret.shape, (1, 0))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/cond_v2_test.py b/tensorflow/python/kernel_tests/cond_v2_test.py
index 759db5d5f43a144150918446e6ce206b3095904f..97ce245fc835a90a83026802353646f9dc8720e5 100644
--- a/tensorflow/python/kernel_tests/cond_v2_test.py
+++ b/tensorflow/python/kernel_tests/cond_v2_test.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import cond_v2
@@ -35,10 +36,12 @@ from tensorflow.python.training import saver
 from tensorflow.python.util import compat
 
 
-class NewCondTest(test.TestCase):
+class CondV2Test(test.TestCase):
 
-  def _testCond(self, true_fn, false_fn, train_vals):
-    with self.test_session() as sess:
+  def _testCond(self, true_fn, false_fn, train_vals, feed_dict=None):
+    if not feed_dict:
+      feed_dict = {}
+    with self.test_session(graph=ops.get_default_graph()) as sess:
       pred = array_ops.placeholder(dtypes.bool, name="pred")
 
       expected = control_flow_ops.cond(pred, true_fn, false_fn, name="expected")
@@ -47,13 +50,17 @@ class NewCondTest(test.TestCase):
       expected_grad = gradients_impl.gradients(expected, train_vals)
       actual_grad = gradients_impl.gradients(actual, train_vals)
 
+      sess_run_args = {pred: True}
+      sess_run_args.update(feed_dict)
       expected_val, actual_val, expected_grad_val, actual_grad_val = sess.run(
-          (expected, actual, expected_grad, actual_grad), {pred: True})
+          (expected, actual, expected_grad, actual_grad), sess_run_args)
       self.assertEqual(expected_val, actual_val)
       self.assertEqual(expected_grad_val, actual_grad_val)
 
+      sess_run_args = {pred: False}
+      sess_run_args.update(feed_dict)
       expected_val, actual_val, expected_grad_val, actual_grad_val = sess.run(
-          (expected, actual, expected_grad, actual_grad), {pred: False})
+          (expected, actual, expected_grad, actual_grad), sess_run_args)
       self.assertEqual(expected_val, actual_val)
       self.assertEqual(expected_grad_val, actual_grad_val)
 
@@ -131,6 +138,349 @@ class NewCondTest(test.TestCase):
         self.assertIn("foo_cond_1_true", ops.get_default_graph()._functions)
         self.assertIn("foo_cond_1_false", ops.get_default_graph()._functions)
 
+  def testDefunInCond(self):
+    x = constant_op.constant(1.0, name="x")
+    y = constant_op.constant(2.0, name="y")
+
+    def true_fn():
+
+      @function.Defun()
+      def fn():
+        return x * y * 2.0
+
+      return fn()
+
+    def false_fn():
+      return 2.0
+
+    self._testCond(true_fn, false_fn, [x])
+    self._testCond(true_fn, false_fn, [x, y])
+    self._testCond(true_fn, false_fn, [y])
+
+  def testNestedDefunInCond(self):
+    x = constant_op.constant(1.0, name="x")
+    y = constant_op.constant(2.0, name="y")
+
+    def true_fn():
+      return 2.0
+
+    def false_fn():
+
+      @function.Defun()
+      def fn():
+
+        @function.Defun()
+        def nested_fn():
+          return x * y * 2.0
+
+        return nested_fn()
+
+      return fn()
+
+    self._testCond(true_fn, false_fn, [x])
+    self._testCond(true_fn, false_fn, [x, y])
+    self._testCond(true_fn, false_fn, [y])
+
+  def testDoubleNestedDefunInCond(self):
+    x = constant_op.constant(1.0, name="x")
+    y = constant_op.constant(2.0, name="y")
+
+    def true_fn():
+
+      @function.Defun()
+      def fn():
+
+        @function.Defun()
+        def nested_fn():
+
+          @function.Defun()
+          def nested_nested_fn():
+            return x * y * 2.0
+
+          return nested_nested_fn()
+
+        return nested_fn()
+
+      return fn()
+
+    def false_fn():
+      return 2.0
+
+    self._testCond(true_fn, false_fn, [x])
+    self._testCond(true_fn, false_fn, [x, y])
+    self._testCond(true_fn, false_fn, [y])
+
+  def testNestedCond(self):
+
+    def run_test(pred_value):
+
+      def build_graph():
+        pred = array_ops.placeholder(dtypes.bool, name="pred")
+        x = constant_op.constant(1.0, name="x")
+        y = constant_op.constant(2.0, name="y")
+
+        def true_fn():
+          return 2.0
+
+        def false_fn():
+
+          def false_true_fn():
+            return x * y * 2.0
+
+          def false_false_fn():
+            return x * 5.0
+
+          return _cond(pred, false_true_fn, false_false_fn, "inside_false_fn")
+
+        return x, y, pred, true_fn, false_fn
+
+      with ops.Graph().as_default():
+        x, y, pred, true_fn, false_fn = build_graph()
+        self._testCond(true_fn, false_fn, [x, y], {pred: pred_value})
+        self._testCond(true_fn, false_fn, [x], {pred: pred_value})
+        self._testCond(true_fn, false_fn, [y], {pred: pred_value})
+
+    run_test(True)
+    run_test(False)
+
+  def testDoubleNestedCond(self):
+
+    def run_test(pred1_value, pred2_value):
+
+      def build_graph():
+        pred1 = array_ops.placeholder(dtypes.bool, name="pred1")
+        pred2 = array_ops.placeholder(dtypes.bool, name="pred2")
+        x = constant_op.constant(1.0, name="x")
+        y = constant_op.constant(2.0, name="y")
+
+        def true_fn():
+          return 2.0
+
+        def false_fn():
+
+          def false_true_fn():
+
+            def false_true_true_fn():
+              return x * y * 2.0
+
+            def false_true_false_fn():
+              return x * 10.0
+
+            return _cond(
+                pred1,
+                false_true_true_fn,
+                false_true_false_fn,
+                name="inside_false_true_fn")
+
+          def false_false_fn():
+            return x * 5.0
+
+          return _cond(
+              pred2, false_true_fn, false_false_fn, name="inside_false_fn")
+
+        return x, y, pred1, pred2, true_fn, false_fn
+
+      with ops.Graph().as_default():
+        x, y, pred1, pred2, true_fn, false_fn = build_graph()
+        self._testCond(true_fn, false_fn, [x, y], {
+            pred1: pred1_value,
+            pred2: pred2_value
+        })
+        x, y, pred1, pred2, true_fn, false_fn = build_graph()
+        self._testCond(true_fn, false_fn, [x], {
+            pred1: pred1_value,
+            pred2: pred2_value
+        })
+        x, y, pred1, pred2, true_fn, false_fn = build_graph()
+        self._testCond(true_fn, false_fn, [y], {
+            pred1: pred1_value,
+            pred2: pred2_value
+        })
+
+    run_test(True, True)
+    run_test(True, False)
+    run_test(False, False)
+    run_test(False, True)
+
+  def testGradientFromInsideDefun(self):
+
+    def build_graph():
+      pred_outer = array_ops.placeholder(dtypes.bool, name="pred_outer")
+      pred_inner = array_ops.placeholder(dtypes.bool, name="pred_inner")
+      x = constant_op.constant(1.0, name="x")
+      y = constant_op.constant(2.0, name="y")
+
+      def true_fn():
+        return 2.0
+
+      def false_fn():
+
+        def inner_true_fn():
+          return x * y * 2.0
+
+        def inner_false_fn():
+          return x * 5.0
+
+        return cond_v2.cond_v2(
+            pred_inner, inner_true_fn, inner_false_fn, name="inner_cond")
+
+      cond_outer = cond_v2.cond_v2(
+          pred_outer, true_fn, false_fn, name="outer_cond")
+
+      # Compute grads inside a Defun.
+      @function.Defun()
+      def nesting_fn():
+        return gradients_impl.gradients(cond_outer, [x, y])
+
+      grads = nesting_fn()
+
+      return grads, pred_outer, pred_inner
+
+    with ops.Graph().as_default():
+      grads, pred_outer, pred_inner = build_graph()
+      with self.test_session(graph=ops.get_default_graph()) as sess:
+        self.assertSequenceEqual(
+            sess.run(grads, {
+                pred_outer: True,
+                pred_inner: True
+            }), [0., 0.])
+        self.assertSequenceEqual(
+            sess.run(grads, {
+                pred_outer: True,
+                pred_inner: False
+            }), [0., 0.])
+        self.assertSequenceEqual(
+            sess.run(grads, {
+                pred_outer: False,
+                pred_inner: True
+            }), [4., 2.])
+        self.assertSequenceEqual(
+            sess.run(grads, {
+                pred_outer: False,
+                pred_inner: False
+            }), [5., 0.])
+
+  def testGradientFromInsideNestedDefun(self):
+
+    def build_graph():
+      pred_outer = array_ops.placeholder(dtypes.bool, name="pred_outer")
+      pred_inner = array_ops.placeholder(dtypes.bool, name="pred_inner")
+      x = constant_op.constant(1.0, name="x")
+      y = constant_op.constant(2.0, name="y")
+
+      def true_fn():
+        return 2.0
+
+      def false_fn():
+
+        def inner_true_fn():
+          return x * y * 2.0
+
+        def inner_false_fn():
+          return x * 5.0
+
+        return cond_v2.cond_v2(
+            pred_inner, inner_true_fn, inner_false_fn, name="inner_cond")
+
+      cond_outer = cond_v2.cond_v2(
+          pred_outer, true_fn, false_fn, name="outer_cond")
+
+      # Compute grads inside a Defun.
+      @function.Defun()
+      def nesting_fn():
+
+        @function.Defun()
+        def inner_nesting_fn():
+          return gradients_impl.gradients(cond_outer, [x, y])
+
+        return inner_nesting_fn()
+
+      grads = nesting_fn()
+
+      return grads, pred_outer, pred_inner
+
+    with ops.Graph().as_default():
+      grads, pred_outer, pred_inner = build_graph()
+      with self.test_session(graph=ops.get_default_graph()) as sess:
+        self.assertSequenceEqual(
+            sess.run(grads, {
+                pred_outer: True,
+                pred_inner: True
+            }), [0., 0.])
+        self.assertSequenceEqual(
+            sess.run(grads, {
+                pred_outer: True,
+                pred_inner: False
+            }), [0., 0.])
+        self.assertSequenceEqual(
+            sess.run(grads, {
+                pred_outer: False,
+                pred_inner: True
+            }), [4., 2.])
+        self.assertSequenceEqual(
+            sess.run(grads, {
+                pred_outer: False,
+                pred_inner: False
+            }), [5., 0.])
+
+  def testBuildCondAndGradientInsideDefun(self):
+
+    def build_graph():
+      pred_outer = array_ops.placeholder(dtypes.bool, name="pred_outer")
+      pred_inner = array_ops.placeholder(dtypes.bool, name="pred_inner")
+      x = constant_op.constant(1.0, name="x")
+      y = constant_op.constant(2.0, name="y")
+
+      # Build cond and its gradient inside a Defun.
+      @function.Defun()
+      def fn():
+
+        def true_fn():
+          return 2.0
+
+        def false_fn():
+
+          def inner_true_fn():
+            return x * y * 2.0
+
+          def inner_false_fn():
+            return x * 5.0
+
+          return cond_v2.cond_v2(
+              pred_inner, inner_true_fn, inner_false_fn, name="inner_cond")
+
+        cond_outer = cond_v2.cond_v2(
+            pred_outer, true_fn, false_fn, name="outer_cond")
+        return gradients_impl.gradients(cond_outer, [x, y])
+
+      grads = fn()
+
+      return grads, pred_outer, pred_inner
+
+    with ops.Graph().as_default():
+      grads, pred_outer, pred_inner = build_graph()
+      with self.test_session(graph=ops.get_default_graph()) as sess:
+        self.assertSequenceEqual(
+            sess.run(grads, {
+                pred_outer: True,
+                pred_inner: True
+            }), [0., 0.])
+        self.assertSequenceEqual(
+            sess.run(grads, {
+                pred_outer: True,
+                pred_inner: False
+            }), [0., 0.])
+        self.assertSequenceEqual(
+            sess.run(grads, {
+                pred_outer: False,
+                pred_inner: True
+            }), [4., 2.])
+        self.assertSequenceEqual(
+            sess.run(grads, {
+                pred_outer: False,
+                pred_inner: False
+            }), [5., 0.])
+
   def testSecondDerivative(self):
     with self.test_session() as sess:
       pred = array_ops.placeholder(dtypes.bool, name="pred")
@@ -532,5 +882,17 @@ class CondV2ColocationGroupAndDeviceTest(test.TestCase):
         self.assertTrue(len(run_metadata.partition_graphs) >= 2)
 
 
+def _cond(pred, true_fn, false_fn, name):
+  if _is_old_cond():
+    return control_flow_ops.cond(pred, true_fn, false_fn, name=name)
+  else:
+    return cond_v2.cond_v2(pred, true_fn, false_fn, name=name)
+
+
+def _is_old_cond():
+  return isinstance(ops.get_default_graph()._get_control_flow_context(),
+                    control_flow_ops.CondContext)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
index 7134e02c348b47048cff5b0c205d1dd613c31a81..58845552db5e22dd4e5e9a6de09de023c58be512 100644
--- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
@@ -90,7 +90,7 @@ def CheckGradConfigsToTest():
 class DepthwiseConv2DTest(test.TestCase):
 
   # This is testing that depthwise_conv2d and depthwise_conv2d_native
-  # produce the same results.  It also tests that NCHW and NWHC
+  # produce the same results.  It also tests that NCHW and NHWC
   # formats agree, by comparing the depthwise_conv2d_native with
   # 'NCHW' format (with transposition) matches the 'NHWC' format using
   # the higher level interface.
@@ -142,7 +142,7 @@ class DepthwiseConv2DTest(test.TestCase):
       native_t1 = t1
       strides = [1, stride, stride, 1]
       if data_format == "NCHW":
-        # Transpose from NWHC input to NCHW
+        # Transpose from NHWC input to NCHW
         # Ex. [4, 5, 5, 48] to [4, 48, 5, 5]
         native_t1 = array_ops.transpose(t1, [0, 3, 1, 2])
         strides = [1, 1, stride, stride]
@@ -368,7 +368,7 @@ class DepthwiseConv2DTest(test.TestCase):
       native_input = input_tensor
       strides = [1, stride, stride, 1]
       if data_format == "NCHW":
-        # Transpose from NWHC input to NCHW
+        # Transpose from NHWC input to NCHW
         # Ex. [4, 5, 5, 48] to [4, 48, 5, 5]
         native_input = array_ops.transpose(input_tensor, [0, 3, 1, 2])
         input_shape = [
diff --git a/tensorflow/python/kernel_tests/distributions/util_test.py b/tensorflow/python/kernel_tests/distributions/util_test.py
index 9d38ffcb4a963efb71153f59d6269ba84a5d1379..61faa8466edcf404dc48fc0596c47cb3c2094f13 100644
--- a/tensorflow/python/kernel_tests/distributions/util_test.py
+++ b/tensorflow/python/kernel_tests/distributions/util_test.py
@@ -311,8 +311,10 @@ class EmbedCheckCategoricalEventShapeTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testUnsupportedDtype(self):
     with self.test_session():
+      param = ops.convert_to_tensor(
+          np.ones([2**11 + 1]).astype(dtypes.qint16.as_numpy_dtype),
+          dtype=dtypes.qint16)
       with self.assertRaises(TypeError):
-        param = array_ops.ones([int(2**11+1)], dtype=dtypes.qint16)
         du.embed_check_categorical_event_shape(param)
 
 
diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py
index 58e2a8ac2a3b827647b1b1176f4b69e6a88b76c6..c0b419e1d13405d04c34fb642cec15760ddcf50f 100644
--- a/tensorflow/python/kernel_tests/gather_nd_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py
@@ -203,8 +203,7 @@ class GatherNdTest(test.TestCase):
       indices = [[[0], [7]]]  # Make this one higher rank
       gather_nd = array_ops.gather_nd(params, indices)
       with self.assertRaisesOpError(
-          r"flat indices\[1, :\] = \[7\] does not index into param "
-          r"\(shape: \[3\]\)"):
+          r"indices\[0,1\] = \[7\] does not index into param shape \[3\]"):
         gather_nd.eval()
 
   def _disabledTestBadIndicesGPU(self):
@@ -217,8 +216,7 @@ class GatherNdTest(test.TestCase):
       indices = [[[0], [7]]]  # Make this one higher rank
       gather_nd = array_ops.gather_nd(params, indices)
       with self.assertRaisesOpError(
-          r"flat indices\[1, :\] = \[7\] does not index into param "
-          r"\(shape: \[3\]\)"):
+          r"indices\[0,1\] = \[7\] does not index into param shape \[3\]"):
         gather_nd.eval()
 
   def testBadIndicesWithSlicesCPU(self):
@@ -227,8 +225,7 @@ class GatherNdTest(test.TestCase):
       indices = [[[0], [0], [1]]]  # Make this one higher rank
       gather_nd = array_ops.gather_nd(params, indices)
       with self.assertRaisesOpError(
-          r"flat indices\[2, :\] = \[1\] does not index into param "
-          r"\(shape: \[1,3\]\)"):
+          r"indices\[0,2\] = \[1\] does not index into param shape \[1,3\]"):
         gather_nd.eval()
 
   def _disabledTestBadIndicesWithSlicesGPU(self):
@@ -241,8 +238,7 @@ class GatherNdTest(test.TestCase):
       indices = [[[0], [0], [1]]]  # Make this one higher rank
       gather_nd = array_ops.gather_nd(params, indices)
       with self.assertRaisesOpError(
-          r"flat indices\[2, :\] = \[1\] does not index into param "
-          r"\(shape: \[1,3\]\)"):
+          r"indices\[0,2\] = \[1\] does not index into param shape \[1,3\]"):
         gather_nd.eval()
 
   def testGradientsRank2Elements(self):
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index 487418e6943ae72cce2e0ed5eee7ecf227fecd3a..f4ec3e3996a17405b65d240534d2f2d47973d418 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -234,3 +234,21 @@ cuda_py_test(
         "optonly",
     ],
 )
+
+cuda_py_test(
+    name = "linear_operator_zeros_test",
+    size = "medium",
+    srcs = ["linear_operator_zeros_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/linalg",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:random_ops",
+    ],
+    shard_count = 5,
+    tags = ["optonly"],  # Test is flaky without optimization.
+)
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f60b55e0ad416ea1f09996f633e09a7dc2c3741
--- /dev/null
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
@@ -0,0 +1,192 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import random_seed
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.linalg import linalg as linalg_lib
+from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.platform import test
+
+
+random_seed.set_random_seed(23)
+rng = np.random.RandomState(2016)
+
+
+class LinearOperatorZerosTest(
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Most tests done in the base class LinearOperatorDerivedClassTest."""
+
+  @property
+  def _tests_to_skip(self):
+    return ["log_abs_det", "solve", "solve_with_broadcast"]
+
+  @property
+  def _operator_build_infos(self):
+    build_info = linear_operator_test_util.OperatorBuildInfo
+    return [
+        build_info((1, 1)),
+        build_info((1, 3, 3)),
+        build_info((3, 4, 4)),
+        build_info((2, 1, 4, 4))]
+
+  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+    del use_placeholder
+    shape = list(build_info.shape)
+    assert shape[-1] == shape[-2]
+
+    batch_shape = shape[:-2]
+    num_rows = shape[-1]
+
+    operator = linalg_lib.LinearOperatorZeros(
+        num_rows, batch_shape=batch_shape, dtype=dtype)
+    matrix = array_ops.zeros(shape=shape, dtype=dtype)
+
+    return operator, matrix
+
+  def test_assert_positive_definite(self):
+    operator = linalg_lib.LinearOperatorZeros(num_rows=2)
+    with self.assertRaisesOpError("non-positive definite"):
+      operator.assert_positive_definite()
+
+  def test_assert_non_singular(self):
+    with self.assertRaisesOpError("non-invertible"):
+      operator = linalg_lib.LinearOperatorZeros(num_rows=2)
+      operator.assert_non_singular()
+
+  def test_assert_self_adjoint(self):
+    with self.test_session():
+      operator = linalg_lib.LinearOperatorZeros(num_rows=2)
+      operator.assert_self_adjoint().run()  # Should not fail
+
+  def test_non_scalar_num_rows_raises_static(self):
+    with self.assertRaisesRegexp(ValueError, "must be a 0-D Tensor"):
+      linalg_lib.LinearOperatorZeros(num_rows=[2])
+    with self.assertRaisesRegexp(ValueError, "must be a 0-D Tensor"):
+      linalg_lib.LinearOperatorZeros(num_rows=2, num_columns=[2])
+
+  def test_non_integer_num_rows_raises_static(self):
+    with self.assertRaisesRegexp(TypeError, "must be integer"):
+      linalg_lib.LinearOperatorZeros(num_rows=2.)
+    with self.assertRaisesRegexp(TypeError, "must be integer"):
+      linalg_lib.LinearOperatorZeros(num_rows=2, num_columns=2.)
+
+  def test_negative_num_rows_raises_static(self):
+    with self.assertRaisesRegexp(ValueError, "must be non-negative"):
+      linalg_lib.LinearOperatorZeros(num_rows=-2)
+    with self.assertRaisesRegexp(ValueError, "must be non-negative"):
+      linalg_lib.LinearOperatorZeros(num_rows=2, num_columns=-2)
+
+  def test_non_1d_batch_shape_raises_static(self):
+    with self.assertRaisesRegexp(ValueError, "must be a 1-D"):
+      linalg_lib.LinearOperatorZeros(num_rows=2, batch_shape=2)
+
+  def test_non_integer_batch_shape_raises_static(self):
+    with self.assertRaisesRegexp(TypeError, "must be integer"):
+      linalg_lib.LinearOperatorZeros(num_rows=2, batch_shape=[2.])
+
+  def test_negative_batch_shape_raises_static(self):
+    with self.assertRaisesRegexp(ValueError, "must be non-negative"):
+      linalg_lib.LinearOperatorZeros(num_rows=2, batch_shape=[-2])
+
+  def test_non_scalar_num_rows_raises_dynamic(self):
+    with self.test_session():
+      num_rows = array_ops.placeholder(dtypes.int32)
+      operator = linalg_lib.LinearOperatorZeros(
+          num_rows, assert_proper_shapes=True)
+      with self.assertRaisesOpError("must be a 0-D Tensor"):
+        operator.to_dense().eval(feed_dict={num_rows: [2]})
+
+  def test_negative_num_rows_raises_dynamic(self):
+    with self.test_session():
+      n = array_ops.placeholder(dtypes.int32)
+      operator = linalg_lib.LinearOperatorZeros(
+          num_rows=n, assert_proper_shapes=True)
+      with self.assertRaisesOpError("must be non-negative"):
+        operator.to_dense().eval(feed_dict={n: -2})
+
+      operator = linalg_lib.LinearOperatorZeros(
+          num_rows=2, num_columns=n, assert_proper_shapes=True)
+      with self.assertRaisesOpError("must be non-negative"):
+        operator.to_dense().eval(feed_dict={n: -2})
+
+  def test_non_1d_batch_shape_raises_dynamic(self):
+    with self.test_session():
+      batch_shape = array_ops.placeholder(dtypes.int32)
+      operator = linalg_lib.LinearOperatorZeros(
+          num_rows=2, batch_shape=batch_shape, assert_proper_shapes=True)
+      with self.assertRaisesOpError("must be a 1-D"):
+        operator.to_dense().eval(feed_dict={batch_shape: 2})
+
+  def test_negative_batch_shape_raises_dynamic(self):
+    with self.test_session():
+      batch_shape = array_ops.placeholder(dtypes.int32)
+      operator = linalg_lib.LinearOperatorZeros(
+          num_rows=2, batch_shape=batch_shape, assert_proper_shapes=True)
+      with self.assertRaisesOpError("must be non-negative"):
+        operator.to_dense().eval(feed_dict={batch_shape: [-2]})
+
+  def test_wrong_matrix_dimensions_raises_static(self):
+    operator = linalg_lib.LinearOperatorZeros(num_rows=2)
+    x = rng.randn(3, 3).astype(np.float32)
+    with self.assertRaisesRegexp(ValueError, "Dimensions.*not compatible"):
+      operator.matmul(x)
+
+  def test_wrong_matrix_dimensions_raises_dynamic(self):
+    num_rows = array_ops.placeholder(dtypes.int32)
+    x = array_ops.placeholder(dtypes.float32)
+
+    with self.test_session():
+      operator = linalg_lib.LinearOperatorZeros(
+          num_rows, assert_proper_shapes=True)
+      y = operator.matmul(x)
+      with self.assertRaisesOpError("Incompatible.*dimensions"):
+        y.eval(feed_dict={num_rows: 2, x: rng.rand(3, 3)})
+
+  def test_is_x_flags(self):
+    # The is_x flags are by default all True.
+    operator = linalg_lib.LinearOperatorZeros(num_rows=2)
+    self.assertFalse(operator.is_positive_definite)
+    self.assertFalse(operator.is_non_singular)
+    self.assertTrue(operator.is_self_adjoint)
+
+
+class LinearOperatorZerosNotSquareTest(
+    linear_operator_test_util.NonSquareLinearOperatorDerivedClassTest):
+
+  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+    del use_placeholder
+    shape = list(build_info.shape)
+
+    batch_shape = shape[:-2]
+    num_rows = shape[-2]
+    num_columns = shape[-1]
+
+    operator = linalg_lib.LinearOperatorZeros(
+        num_rows, num_columns, is_square=False, is_self_adjoint=False,
+        batch_shape=batch_shape, dtype=dtype)
+    matrix = array_ops.zeros(shape=shape, dtype=dtype)
+
+    return operator, matrix
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
index d8ce9fffbd2bc0d18033339a02e0ad84f8f4c952..3cbbd48c8cb26d5cdb457c9599bfc9131000d174 100644
--- a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
@@ -82,7 +82,7 @@ def CheckGradConfigsToTest():
 class DepthwiseConv2DTest(test.TestCase):
 
   # This is testing that depthwise_conv2d and depthwise_conv2d_native
-  # produce the same results.  It also tests that NCHW and NWHC
+  # produce the same results.  It also tests that NCHW and NHWC
   # formats agree, by comparing the depthwise_conv2d_native with
   # 'NCHW' format (with transposition) matches the 'NHWC' format using
   # the higher level interface.
@@ -123,7 +123,7 @@ class DepthwiseConv2DTest(test.TestCase):
       native_t1 = t1
       strides = [1, stride, stride, 1]
       if data_format == "NCHW":
-        # Transpose from NWHC input to NCHW
+        # Transpose from NHWC input to NCHW
         # Ex. [4, 5, 5, 48] to [4, 48, 5, 5]
         native_t1 = array_ops.transpose(t1, [0, 3, 1, 2])
         strides = [1, 1, stride, stride]
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index f9b9c77bbf7e2a8afdbfbd0929a68856b8aae51c..f2f30234696be7f6c8c98d041bc415ccf5cb4ecf 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -268,12 +268,12 @@ class StatefulScatterNdTest(test.TestCase):
         # Test some out of range errors.
         indices = np.array([[-1], [0], [5]])
         with self.assertRaisesOpError(
-            r"Invalid indices: \[0,0\] = \[-1\] does not index into \[6\]"):
+            r"indices\[0\] = \[-1\] does not index into shape \[6\]"):
           op(ref, indices, updates).eval()
 
         indices = np.array([[2], [0], [6]])
         with self.assertRaisesOpError(
-            r"Invalid indices: \[2,0\] = \[6\] does not index into \[6\]"):
+            r"indices\[2\] = \[6\] does not index into shape \[6\]"):
           op(ref, indices, updates).eval()
 
   def testRank3ValidShape(self):
@@ -369,6 +369,29 @@ class ScatterNdTest(test.TestCase):
     del input_  # input_ is not used in scatter_nd
     return array_ops.scatter_nd(indices, updates, shape)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testBool(self):
+    indices = constant_op.constant(
+        [[4], [3], [1], [7]], dtype=dtypes.int32)
+    updates = constant_op.constant(
+        [False, True, False, True], dtype=dtypes.bool)
+    expected = np.array(
+        [False, False, False, True, False, False, False, True])
+    scatter = self.scatter_nd(indices, updates, shape=(8,))
+    result = self.evaluate(scatter)
+    self.assertAllEqual(expected, result)
+
+    # Same indice is updated twice by same value.
+    indices = constant_op.constant(
+        [[4], [3], [3], [7]], dtype=dtypes.int32)
+    updates = constant_op.constant(
+        [False, True, True, True], dtype=dtypes.bool)
+    expected = np.array([
+        False, False, False, True, False, False, False, True])
+    scatter = self.scatter_nd(indices, updates, shape=(8,))
+    result = self.evaluate(scatter)
+    self.assertAllEqual(expected, result)
+
   @test_util.run_in_graph_and_eager_modes
   def testInvalidShape(self):
     # TODO(apassos) figure out how to unify these errors
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index fe459a96b98733f8a706b0c3b84000c5a74894ad..a2b5f77f915404b124b14c5d34996cd85cb2b468 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -790,7 +790,7 @@ def _ExtractImagePatchesGrad(op, grad):
 
   sp_mat = sparse_tensor.SparseTensor(
       array_ops.constant(idx, dtype=ops.dtypes.int64),
-      array_ops.ones((len(idx),), dtype=ops.dtypes.float32), sp_shape)
+      array_ops.ones((len(idx),), dtype=grad.dtype), sp_shape)
 
   jac = sparse_ops.sparse_tensor_dense_matmul(sp_mat, grad_flat)
 
diff --git a/tensorflow/python/ops/boosted_trees_ops.py b/tensorflow/python/ops/boosted_trees_ops.py
index 868a4f6b84df2c0d1b8b55a254f16f1be5ee1f1d..f7cbfe0312fe5e7d8d75580af9f362236fd5b79d 100644
--- a/tensorflow/python/ops/boosted_trees_ops.py
+++ b/tensorflow/python/ops/boosted_trees_ops.py
@@ -37,8 +37,19 @@ from tensorflow.python.training import saver
 
 
 class PruningMode(object):
+  """Class for working with Pruning modes."""
   NO_PRUNING, PRE_PRUNING, POST_PRUNING = range(0, 3)
 
+  _map = {'none': NO_PRUNING, 'pre': PRE_PRUNING, 'post': POST_PRUNING}
+
+  @classmethod
+  def from_str(cls, mode):
+    if mode in cls._map:
+      return cls._map[mode]
+    else:
+      raise ValueError('pruning_mode mode must be one of: {}'.format(', '.join(
+          sorted(cls._map))))
+
 
 class _TreeEnsembleSavable(saver.BaseSaverBuilder.SaveableObject):
   """SaveableObject implementation for TreeEnsemble."""
diff --git a/tensorflow/python/ops/cond_v2_impl.py b/tensorflow/python/ops/cond_v2_impl.py
index d310f83dca97889157eb078b11a3ca51caae2fc2..44c5c050c08240dd13766b71e8e708c9b0317399 100644
--- a/tensorflow/python/ops/cond_v2_impl.py
+++ b/tensorflow/python/ops/cond_v2_impl.py
@@ -58,12 +58,14 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
 
   with ops.name_scope(name) as scope:
     # Identify if there is a caller device, & get the innermost if possible.
-    device_stack = ops.get_default_graph()._device_function_stack
-    caller_device = device_stack[-1] if device_stack else None
+    # pylint: disable=protected-access
+    device_funcs = ops.get_default_graph()._device_functions_outer_to_inner
+    caller_device = device_funcs[-1] if device_funcs else None
 
     caller_colocation_stack = ops.get_default_graph()._colocation_stack
     caller_container = ops.get_default_graph()._container
     caller_collection_ref = ops.get_default_graph()._collections
+    # pylint: enable=protected-access
 
     func_name_prefix = scope.replace("/", "_")
 
@@ -106,7 +108,7 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
     false_graph.outputs.extend(extra_false_outputs)
 
     # Create the If op.
-    tensors = gen_functional_ops._if(
+    tensors = gen_functional_ops._if(  # pylint: disable=protected-access
         pred, cond_inputs, [t.dtype for t in true_graph.outputs],
         _create_new_tf_function(true_graph),
         _create_new_tf_function(false_graph),
@@ -125,8 +127,10 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
     # TODO(b/110167197) this approach requires cond_v2 to have at least 1 output
     if_op = tensors[0].op
     if not control_flow_util.IsInXLAContext(if_op):
+      # pylint: disable=protected-access
       if_op._set_attr("_lower_using_switch_merge",
                       attr_value_pb2.AttrValue(b=True))
+      # pylint: enable=protected-access
 
     return tensors[:num_cond_outputs]
 
@@ -135,6 +139,10 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
 def _IfGrad(op, *grads):  # pylint: disable=invalid-name
   """The gradient of an If op produced by cond_v2."""
   true_graph, false_graph = _get_func_graphs(op)
+  # Note: op.graph != ops.get_default_graph() when we are computing the gradient
+  # of a nested cond.
+  assert true_graph._outer_graph == op.graph
+  assert false_graph._outer_graph == op.graph
 
   # Create grad functions that compute the gradient of the true/false forward
   # graphs. These functions will capture tensors from the forward pass
@@ -147,15 +155,16 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
   assert ([t.dtype for t in true_grad_graph.outputs] ==
           [t.dtype for t in false_grad_graph.outputs])
 
-  # Match up the captured grad function inputs with outputs of 'op' and other
-  # external tensors.
-  true_grad_inputs = _get_grad_inputs(op, true_graph, true_grad_graph)
-  false_grad_inputs = _get_grad_inputs(op, false_graph, false_grad_graph)
+  # Resolve references to forward graph tensors in grad graphs and ensure
+  # they are in-scope, i.e., belong to one of outer graphs of the grad graph.
+  true_grad_extra_inputs = _resolve_grad_inputs(true_graph, true_grad_graph)
+  false_grad_extra_inputs = _resolve_grad_inputs(false_graph, false_grad_graph)
 
   # Make the inputs to true_grad_graph and false_grad_graph match. Note that
   # this modifies true_grad_graph and false_grad_graph.
   grad_inputs = _make_inputs_match(true_grad_graph, false_grad_graph,
-                                   true_grad_inputs, false_grad_inputs)
+                                   true_grad_extra_inputs,
+                                   false_grad_extra_inputs)
 
   # Add all intermediate tensors as function outputs so they're available for
   # higher-order gradient computations.
@@ -199,11 +208,20 @@ def _get_func_graphs(if_op):
     input_shapes = [t.shape for t in extra_inputs]
     func_name = if_op.get_attr(branch_name).name
     fdef = if_op.graph._get_function(func_name).definition
-    func_graph = _function_def_to_graph.function_def_to_graph(
-        fdef, input_shapes)
+    # `if_op.graph` may not be the same as `ops.get_default_graph()` e.g.
+    # in the case of nested if ops or when the gradient is being computed
+    # from inside a Defun. We build the `func_graph` with `if_op.graph` as its
+    # `outer_graph`. This resembles how the `_FuncGraph` was built in the
+    # forward pass. We need this so that we can resolve references to tensors
+    # in `func_graph` from its gradient graph in `_resolve_grad_inputs`.
+    with if_op.graph.as_default():
+      func_graph = _function_def_to_graph.function_def_to_graph(
+          fdef, input_shapes)
     func_graph.extra_inputs = extra_inputs
     func_graph.extra_args = func_graph.inputs
     func_graph._captured = dict(zip(extra_inputs, func_graph.inputs))
+    # Set the if op so that the gradient code can use it.
+    func_graph._if = if_op
     return func_graph
 
   return (_get_func_graph_for_branch("then_branch"),
@@ -240,7 +258,7 @@ def _grad_fn(func_graph, grads):
   # Build the gradient graph. Note that this builds the gradient computation of
   # func_graph in the current graph, which requires capturing tensors from
   # func_graph. The captured func_graph tensors are resolved to external tensors
-  # in _get_grad_inputs.
+  # in _resolve_grad_inputs.
   result = _gradients_impl._GradientsHelper(
       ys, func_graph.inputs, grad_ys=grad_ys,
       src_graph=func_graph)
@@ -261,43 +279,49 @@ def _create_grad_func(func_graph, grads, name):
                                            [], [], name)
 
 
-def _get_grad_inputs(if_op, cond_graph, grad_graph):
-  """Returns the tensors we should pass to grad_graph.
+def _resolve_grad_inputs(cond_graph, grad_graph):
+  """Returns the tensors to pass as `extra_inputs` to `grad_graph`.
 
-  This method handles tensors captured from cond_graph in grad_graph. It
-  converts these to suitable input tensors from the outer graph.
+  The `grad_graph` may have external references to
+  1. Its outer graph containing the input gradients. These references are kept
+     as is.
+  2. Tensors in the forward pass graph. These tensors may not be "live"
+     when the gradient is being computed. We replace such references by their
+     corresponding tensor in the least common ancestor graph of `grad_graph` and
+     `cond_graph`. Since we export intermediate tensors for all branch
+     functions, this is always possible.
 
   Args:
-    if_op: Operation. The forward-pass If op that uses cond_graph.
     cond_graph: function._FuncGraph. The forward-pass function.
     grad_graph: function._FuncGraph. The gradients function.
 
   Returns:
     A list of inputs tensors to be passed to grad_graph.
   """
-  inputs = []
-
-  # Maps placeholders in cond_graph -> input tensor in outer graph.
-  forward_input_map = {v: k for k, v in cond_graph._captured.items()}
+  new_extra_inputs = []
 
   for t in grad_graph.extra_inputs:
-    if t.graph == ops.get_default_graph():
-      # t is in the outer graph (e.g. one of the input gradients).
-      inputs.append(t)
-    elif t in forward_input_map:
-      # t is an input placeholder in cond_graph. Get the corresponding input
-      # tensor in the outer graph.
-      assert t.graph == cond_graph
-      assert forward_input_map[t].graph == ops.get_default_graph()
-      inputs.append(forward_input_map[t])
-    else:
-      # t is an intermediate value in cond_graph. Get the corresponding output
-      # of 'if_op' (note that all intermediate values are outputs).
-      assert t.graph == cond_graph
-      output_idx = cond_graph.outputs.index(t)
-      inputs.append(if_op.outputs[output_idx])
-
-  return inputs
+    if t.graph != grad_graph._outer_graph:
+      # `t` is a tensor in `cond_graph` or one of its ancestors. We bubble this
+      # tensor to the least common ancestor of the `cond_graph` and
+      # `grad_graph` so that it is "in-scope" for `grad_graph`.
+      # TODO(srbs): `_is_ancestor` calls may be expensive. Compute the least
+      # common ancestor once and re-use.
+      assert _is_ancestor(cond_graph, t.graph)
+      while not _is_ancestor(grad_graph, t.graph):
+        assert isinstance(t.graph, _function._FuncGraph)
+        if t in t.graph.extra_args:
+          # TODO(srbs): Consider building a map of extra_args -> extra_inputs.
+          # instead of searching for `t` twice.
+          t = t.graph.extra_inputs[t.graph.extra_args.index(t)]
+        else:
+          # Note: All intermediate tensors are output by the If op.
+          # TODO(srbs): .index() calls may be expensive. Optimize.
+          t = t.graph._if.outputs[t.graph.outputs.index(t)]
+      assert _is_ancestor(grad_graph, t.graph)
+    new_extra_inputs.append(t)
+
+  return new_extra_inputs
 
 
 def _create_new_tf_function(func_graph):
@@ -326,7 +350,8 @@ def _create_new_tf_function(func_graph):
   # a new TF_Function that we add to the graph.
   fdef = _function.function_def_from_tf_function(c_func)
   defined_func = _function._from_definition(fdef)
-  defined_func.add_to_graph(ops.get_default_graph())
+  defined_func._sub_functions = func_graph._functions
+  defined_func.add_to_graph(func_graph._outer_graph)
 
   return func_graph.name
 
@@ -389,7 +414,8 @@ def _pad_params(true_graph, false_graph, true_params, false_params):
   return new_true_params, new_false_inputs
 
 
-def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
+def _make_inputs_match(true_graph, false_graph, true_extra_inputs,
+                       false_extra_inputs):
   """Modifies true_graph and false_graph so they have the same input signature.
 
   This method reorders and/or adds parameters to true_graph and false_graph so
@@ -400,9 +426,9 @@ def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
   Args:
     true_graph: function._FuncGraph
     false_graph: function._FuncGraph
-    true_inputs: a list of Tensors in the outer graph. The inputs for
+    true_extra_inputs: a list of Tensors in the outer graph. The inputs for
       true_graph.
-    false_inputs: a list of Tensors in the outer graph. The inputs for
+    false_extra_inputs: a list of Tensors in the outer graph. The inputs for
       false_graph.
 
   Returns:
@@ -411,12 +437,12 @@ def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
     false_inputs.
   """
   shared_inputs, true_only_inputs, false_only_inputs = _separate_unique_inputs(
-      true_inputs, false_inputs)
+      true_extra_inputs, false_extra_inputs)
 
   new_inputs = shared_inputs + true_only_inputs + false_only_inputs
 
-  true_input_to_param = dict(zip(true_inputs, true_graph.inputs))
-  false_input_to_param = dict(zip(false_inputs, false_graph.inputs))
+  true_input_to_param = dict(zip(true_extra_inputs, true_graph.inputs))
+  false_input_to_param = dict(zip(false_extra_inputs, false_graph.inputs))
 
   true_graph.inputs = (
       [true_input_to_param[t] for t in shared_inputs] +
@@ -432,6 +458,9 @@ def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
   true_graph.extra_inputs = new_inputs
   false_graph.extra_inputs = new_inputs
 
+  true_graph.extra_args = true_graph.inputs
+  false_graph.extra_args = false_graph.inputs
+
   true_graph._captured = dict(zip(new_inputs, true_graph.inputs))
   false_graph._captured = dict(zip(new_inputs, false_graph.inputs))
 
@@ -454,14 +483,30 @@ def _create_dummy_params(func_graph, template_tensors):
 
 
 def _get_grad_fn_name(func_graph):
-  """Returns a unique name to use for the grad function of `func_graph`."""
+  """Returns a unique name to use for the grad function of `func_graph`.
+
+  Ensures this name is unique in the entire hierarchy.
+
+  Args:
+    func_graph: The _FuncGraph.
+
+  Returns:
+    A string, the name to use for the gradient function.
+  """
   name = "%s_grad" % func_graph.name
 
   base_name = name
   counter = 1
-  if ops.get_default_graph()._is_function(name):
-    name = "%s_%s" % (base_name, counter)
-    counter += 1
+  has_conflict = True
+  while has_conflict:
+    curr_graph = func_graph._outer_graph
+    has_conflict = curr_graph._is_function(name)
+    while not has_conflict and isinstance(curr_graph, _function._FuncGraph):
+      curr_graph = curr_graph._outer_graph
+      has_conflict = curr_graph._is_function(name)
+    if has_conflict:
+      name = "%s_%s" % (base_name, counter)
+      counter += 1
 
   return name
 
@@ -477,3 +522,11 @@ def _check_same_outputs(true_graph, false_graph):
         "arguments, got:\n"
         "  true_fn: %s\n"
         "  false_fn: %s" % (true_output_types, false_output_types))
+
+
+def _is_ancestor(graph, maybe_ancestor):
+  if maybe_ancestor == graph:
+    return True
+  if isinstance(graph, _function._FuncGraph):
+    return _is_ancestor(graph._outer_graph, maybe_ancestor)
+  return False
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 888075ba2ecbebfdee08841cfbbfb05d2582b0f7..aeac61c005ab5dae0a3e467ca89ee9026e26eec0 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -1817,15 +1817,34 @@ class CondContext(ControlFlowContext):
   def _AddOpInternal(self, op):
     """Add `op` to the current context."""
     if not op.inputs:
-      # Remove any external control dependency on this op
+      # If we're in a while loop, remove any control inputs from outside the
+      # loop.
       self._RemoveExternalControlEdges(op)
-      # pylint: disable=protected-access
-      op._add_control_input(self._pivot.op)
-      # pylint: enable=protected-access
+
+      if not any(util.OpInContext(input_op, self)
+                 for input_op in op.control_inputs):
+        # pylint: disable=protected-access
+        op._add_control_input(self._pivot.op)
+        # pylint: enable=protected-access
     else:
+      # Make each input to 'op' available in this CondContext. If an input is
+      # already part of this context there's nothing to do, but if it's
+      # external, AddValue() will handle adding the appropriate Switch node and
+      # other bookkeeping.
       for index in range(len(op.inputs)):
         x = op.inputs[index]
-        real_x = self.AddValue(x)
+        if op.type == "Merge" and x.op.type == "NextIteration":
+          # Edge case: if we're importing a while loop inside this CondContext,
+          # AddValue() will not correctly handle the NextIteration inputs to
+          # Merge node. The problem is that the NextIteration should also be
+          # part of this context, but if we're importing it won't have been
+          # processed and added to the context yet, so AddValue() will try to
+          # add a Switch which results in an invalid graph. Instead, we use the
+          # NextIteration input as-is here, and it will eventually be added to
+          # the context via AddOp().
+          real_x = x
+        else:
+          real_x = self.AddValue(x)
         if real_x != x:
           # pylint: disable=protected-access
           op._update_input(index, real_x)
diff --git a/tensorflow/python/ops/control_flow_util.py b/tensorflow/python/ops/control_flow_util.py
index 7a18986c5b03446d2b0e0a2ecd161dccdc3d70e1..72c074ed1af208da274edd52572961ecaa613b34 100644
--- a/tensorflow/python/ops/control_flow_util.py
+++ b/tensorflow/python/ops/control_flow_util.py
@@ -214,6 +214,14 @@ def IsContainingContext(ctxt, maybe_containing_ctxt):
   return True
 
 
+def OpInContext(op, ctxt):
+  return IsContainingContext(op._get_control_flow_context(), ctxt)  # pylint: disable=protected-access
+
+
+def TensorInContext(tensor, ctxt):
+  return OpInContext(tensor.op, ctxt)
+
+
 def CheckInputFromValidContext(op, input_op):
   """Returns whether `input_op` can be used from `op`s context.
 
diff --git a/tensorflow/python/ops/conv2d_benchmark.py b/tensorflow/python/ops/conv2d_benchmark.py
index aacdaa7ad019d8aae2d0b533cde8412ab0f0fa22..28111c273059bca3c4cc643b4aa826f9be402308 100644
--- a/tensorflow/python/ops/conv2d_benchmark.py
+++ b/tensorflow/python/ops/conv2d_benchmark.py
@@ -175,7 +175,8 @@ class Conv2DBenchmark(test.Benchmark):
 
     data_types = [dtypes.float32, dtypes.float16]
     data_formats = ["NHWC", "NCHW"]
-    in_channels = list(range(3, 16))
+    in_channels = list(range(1, 10)) + list(range(10, 20, 2)) + list(
+        range(20, 33, 4))
     out_channels = [4, 16, 32]
     hw_strides = [[2, 2]]
     paddings = ["VALID", "SAME"]
diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index a7ba0bbe9cbc4be9daea79cc97eaac4c21523c04..c29b5033bb137e8376e1c19985755b4fc72e8834 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops.linalg.linear_operator_identity import *
 from tensorflow.python.ops.linalg.linear_operator_kronecker import *
 from tensorflow.python.ops.linalg.linear_operator_low_rank_update import *
 from tensorflow.python.ops.linalg.linear_operator_lower_triangular import *
+from tensorflow.python.ops.linalg.linear_operator_zeros import *
 # pylint: enable=wildcard-import
 
 # Seal API.
diff --git a/tensorflow/python/ops/linalg/linear_operator_zeros.py b/tensorflow/python/ops/linalg/linear_operator_zeros.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8a79c065b32f452cfbb49c6bbd485556cc79445
--- /dev/null
+++ b/tensorflow/python/ops/linalg/linear_operator_zeros.py
@@ -0,0 +1,452 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""`LinearOperator` acting like a zero matrix."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_util
+from tensorflow.python.util.tf_export import tf_export
+
+__all__ = [
+    "LinearOperatorZeros",
+]
+
+
+@tf_export("linalg.LinearOperatorZeros")
+class LinearOperatorZeros(linear_operator.LinearOperator):
+  """`LinearOperator` acting like a [batch] zero matrix.
+
+  This operator acts like a [batch] zero matrix `A` with shape
+  `[B1,...,Bb, N, M]` for some `b >= 0`.  The first `b` indices index a
+  batch member.  For every batch index `(i1,...,ib)`, `A[i1,...,ib, : :]` is
+  an `N x M` matrix.  This matrix `A` is not materialized, but for
+  purposes of broadcasting this shape will be relevant.
+
+  `LinearOperatorZeros` is initialized with `num_rows`, and optionally
+  `num_columns, `batch_shape`, and `dtype` arguments.  If `num_columns` is
+  `None`, then this operator will be initialized as a square matrix. If
+  `batch_shape` is `None`, this operator efficiently passes through all
+  arguments.  If `batch_shape` is provided, broadcasting may occur, which will
+  require making copies.
+
+  ```python
+  # Create a 2 x 2 zero matrix.
+  operator = LinearOperatorZero(num_rows=2, dtype=tf.float32)
+
+  operator.to_dense()
+  ==> [[0., 0.]
+       [0., 0.]]
+
+  operator.shape
+  ==> [2, 2]
+
+  operator.determinant()
+  ==> 0.
+
+  x = ... Shape [2, 4] Tensor
+  operator.matmul(x)
+  ==> Shape [2, 4] Tensor, same as x.
+
+  # Create a 2-batch of 2x2 zero matrices
+  operator = LinearOperatorZeros(num_rows=2, batch_shape=[2])
+  operator.to_dense()
+  ==> [[[0., 0.]
+        [0., 0.]],
+       [[0., 0.]
+        [0., 0.]]]
+
+  # Here, even though the operator has a batch shape, the input is the same as
+  # the output, so x can be passed through without a copy.  The operator is able
+  # to detect that no broadcast is necessary because both x and the operator
+  # have statically defined shape.
+  x = ... Shape [2, 2, 3]
+  operator.matmul(x)
+  ==> Shape [2, 2, 3] Tensor, same as tf.zeros_like(x)
+
+  # Here the operator and x have different batch_shape, and are broadcast.
+  # This requires a copy, since the output is different size than the input.
+  x = ... Shape [1, 2, 3]
+  operator.matmul(x)
+  ==> Shape [2, 2, 3] Tensor, equal to tf.zeros_like([x, x])
+  ```
+
+  ### Shape compatibility
+
+  This operator acts on [batch] matrix with compatible shape.
+  `x` is a batch matrix with compatible shape for `matmul` and `solve` if
+
+  ```
+  operator.shape = [B1,...,Bb] + [N, M],  with b >= 0
+  x.shape =   [C1,...,Cc] + [M, R],
+  and [C1,...,Cc] broadcasts with [B1,...,Bb] to [D1,...,Dd]
+  ```
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning:
+
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               num_rows,
+               num_columns=None,
+               batch_shape=None,
+               dtype=None,
+               is_non_singular=False,
+               is_self_adjoint=True,
+               is_positive_definite=False,
+               is_square=True,
+               assert_proper_shapes=False,
+               name="LinearOperatorZeros"):
+    r"""Initialize a `LinearOperatorZeros`.
+
+    The `LinearOperatorZeros` is initialized with arguments defining `dtype`
+    and shape.
+
+    This operator is able to broadcast the leading (batch) dimensions, which
+    sometimes requires copying data.  If `batch_shape` is `None`, the operator
+    can take arguments of any batch shape without copying.  See examples.
+
+    Args:
+      num_rows:  Scalar non-negative integer `Tensor`.  Number of rows in the
+        corresponding zero matrix.
+      num_columns:  Scalar non-negative integer `Tensor`.  Number of columns in
+        the corresponding zero matrix. If `None`, defaults to the value of
+        `num_rows`.
+      batch_shape:  Optional `1-D` integer `Tensor`.  The shape of the leading
+        dimensions.  If `None`, this operator has no leading dimensions.
+      dtype:  Data type of the matrix that this operator represents.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      assert_proper_shapes:  Python `bool`.  If `False`, only perform static
+        checks that initialization and method arguments have proper shape.
+        If `True`, and static checks are inconclusive, add asserts to the graph.
+      name: A name for this `LinearOperator`
+
+    Raises:
+      ValueError:  If `num_rows` is determined statically to be non-scalar, or
+        negative.
+      ValueError:  If `num_columns` is determined statically to be non-scalar,
+        or negative.
+      ValueError:  If `batch_shape` is determined statically to not be 1-D, or
+        negative.
+      ValueError:  If any of the following is not `True`:
+        `{is_self_adjoint, is_non_singular, is_positive_definite}`.
+    """
+    dtype = dtype or dtypes.float32
+    self._assert_proper_shapes = assert_proper_shapes
+
+    with ops.name_scope(name):
+      dtype = dtypes.as_dtype(dtype)
+      if not is_self_adjoint and is_square:
+        raise ValueError("A zero operator is always self adjoint.")
+      if is_non_singular:
+        raise ValueError("A zero operator is always singular.")
+      if is_positive_definite:
+        raise ValueError("A zero operator is always not positive-definite.")
+
+      super(LinearOperatorZeros, self).__init__(
+          dtype=dtype,
+          is_non_singular=is_non_singular,
+          is_self_adjoint=is_self_adjoint,
+          is_positive_definite=is_positive_definite,
+          is_square=is_square,
+          name=name)
+
+      self._num_rows = linear_operator_util.shape_tensor(
+          num_rows, name="num_rows")
+      self._num_rows_static = tensor_util.constant_value(self._num_rows)
+
+      if num_columns is None:
+        num_columns = num_rows
+
+      self._num_columns = linear_operator_util.shape_tensor(
+          num_columns, name="num_columns")
+      self._num_columns_static = tensor_util.constant_value(self._num_columns)
+
+      self._check_domain_range_possibly_add_asserts()
+
+      if (self._num_rows_static is not None and
+          self._num_columns_static is not None):
+        if is_square and self._num_rows_static != self._num_columns_static:
+          raise ValueError(
+              "LinearOperatorZeros initialized as is_square=True, but got "
+              "num_rows({}) != num_columns({})".format(
+                  self._num_rows_static,
+                  self._num_columns_static))
+
+      if batch_shape is None:
+        self._batch_shape_arg = None
+      else:
+        self._batch_shape_arg = linear_operator_util.shape_tensor(
+            batch_shape, name="batch_shape_arg")
+        self._batch_shape_static = tensor_util.constant_value(
+            self._batch_shape_arg)
+        self._check_batch_shape_possibly_add_asserts()
+
+  def _shape(self):
+    matrix_shape = tensor_shape.TensorShape((self._num_rows_static,
+                                             self._num_columns_static))
+    if self._batch_shape_arg is None:
+      return matrix_shape
+
+    batch_shape = tensor_shape.TensorShape(self._batch_shape_static)
+    return batch_shape.concatenate(matrix_shape)
+
+  def _shape_tensor(self):
+    matrix_shape = array_ops.stack((self._num_rows, self._num_columns), axis=0)
+    if self._batch_shape_arg is None:
+      return matrix_shape
+
+    return array_ops.concat((self._batch_shape_arg, matrix_shape), 0)
+
+  def _assert_non_singular(self):
+    raise errors.InvalidArgumentError(
+        node_def=None, op=None, message="Zero operators are always "
+        "non-invertible.")
+
+  def _assert_positive_definite(self):
+    raise errors.InvalidArgumentError(
+        node_def=None, op=None, message="Zero operators are always "
+        "non-positive definite.")
+
+  def _assert_self_adjoint(self):
+    return control_flow_ops.no_op("assert_self_adjoint")
+
+  def _possibly_broadcast_batch_shape(self, x):
+    """Return 'x', possibly after broadcasting the leading dimensions."""
+    # If we have no batch shape, our batch shape broadcasts with everything!
+    if self._batch_shape_arg is None:
+      return x
+
+    # Static attempt:
+    #   If we determine that no broadcast is necessary, pass x through
+    #   If we need a broadcast, add to an array of zeros.
+    #
+    # special_shape is the shape that, when broadcast with x's shape, will give
+    # the correct broadcast_shape.  Note that
+    #   We have already verified the second to last dimension of self.shape
+    #   matches x's shape in assert_compatible_matrix_dimensions.
+    #   Also, the final dimension of 'x' can have any shape.
+    #   Therefore, the final two dimensions of special_shape are 1's.
+    special_shape = self.batch_shape.concatenate([1, 1])
+    bshape = array_ops.broadcast_static_shape(x.get_shape(), special_shape)
+    if special_shape.is_fully_defined():
+      # bshape.is_fully_defined iff special_shape.is_fully_defined.
+      if bshape == x.get_shape():
+        return x
+      # Use the built in broadcasting of addition.
+      zeros = array_ops.zeros(shape=special_shape, dtype=self.dtype)
+      return x + zeros
+
+    # Dynamic broadcast:
+    #   Always add to an array of zeros, rather than using a "cond", since a
+    #   cond would require copying data from GPU --> CPU.
+    special_shape = array_ops.concat((self.batch_shape_tensor(), [1, 1]), 0)
+    zeros = array_ops.zeros(shape=special_shape, dtype=self.dtype)
+    return x + zeros
+
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    if self._assert_proper_shapes:
+      x = linalg.adjoint(x) if adjoint_arg else x
+      aps = linear_operator_util.assert_compatible_matrix_dimensions(self, x)
+      x = control_flow_ops.with_dependencies([aps], x)
+    if self.is_square:
+      # Note that adjoint has no effect since this matrix is self-adjoint.
+      if adjoint_arg:
+        output_shape = array_ops.concat([
+            array_ops.shape(x)[:-2],
+            [array_ops.shape(x)[-1], array_ops.shape(x)[-2]]], axis=0)
+      else:
+        output_shape = array_ops.shape(x)
+
+      return self._possibly_broadcast_batch_shape(
+          array_ops.zeros(shape=output_shape, dtype=x.dtype))
+
+    x_shape = array_ops.shape(x)
+    n = self._num_columns if adjoint else self._num_rows
+    m = x_shape[-2] if adjoint_arg else x_shape[-1]
+
+    output_shape = array_ops.concat([x_shape[:-2], [n, m]], axis=0)
+
+    zeros = array_ops.zeros(shape=output_shape, dtype=x.dtype)
+    return self._possibly_broadcast_batch_shape(zeros)
+
+  def _determinant(self):
+    if self.batch_shape.is_fully_defined():
+      return array_ops.zeros(shape=self.batch_shape, dtype=self.dtype)
+    else:
+      return array_ops.zeros(shape=self.batch_shape_tensor(), dtype=self.dtype)
+
+  def _trace(self):
+    # Get Tensor of all zeros of same shape as self.batch_shape.
+    if self.batch_shape.is_fully_defined():
+      return array_ops.zeros(shape=self.batch_shape, dtype=self.dtype)
+    else:
+      return array_ops.zeros(shape=self.batch_shape_tensor(), dtype=self.dtype)
+
+  def _diag_part(self):
+    return self._zeros_diag()
+
+  def add_to_tensor(self, mat, name="add_to_tensor"):
+    """Add matrix represented by this operator to `mat`.  Equiv to `I + mat`.
+
+    Args:
+      mat:  `Tensor` with same `dtype` and shape broadcastable to `self`.
+      name:  A name to give this `Op`.
+
+    Returns:
+      A `Tensor` with broadcast shape and same `dtype` as `self`.
+    """
+    return self._possibly_broadcast_batch_shape(mat)
+
+  def _check_domain_range_possibly_add_asserts(self):
+    """Static check of init arg `num_rows`, possibly add asserts."""
+    # Possibly add asserts.
+    if self._assert_proper_shapes:
+      self._num_rows = control_flow_ops.with_dependencies([
+          check_ops.assert_rank(
+              self._num_rows,
+              0,
+              message="Argument num_rows must be a 0-D Tensor."),
+          check_ops.assert_non_negative(
+              self._num_rows,
+              message="Argument num_rows must be non-negative."),
+      ], self._num_rows)
+      self._num_columns = control_flow_ops.with_dependencies([
+          check_ops.assert_rank(
+              self._num_columns,
+              0,
+              message="Argument num_columns must be a 0-D Tensor."),
+          check_ops.assert_non_negative(
+              self._num_columns,
+              message="Argument num_columns must be non-negative."),
+      ], self._num_columns)
+
+    # Static checks.
+    if not self._num_rows.dtype.is_integer:
+      raise TypeError("Argument num_rows must be integer type.  Found:"
+                      " %s" % self._num_rows)
+
+    if not self._num_columns.dtype.is_integer:
+      raise TypeError("Argument num_columns must be integer type.  Found:"
+                      " %s" % self._num_columns)
+
+    num_rows_static = self._num_rows_static
+    num_columns_static = self._num_columns_static
+
+    if num_rows_static is not None:
+      if num_rows_static.ndim != 0:
+        raise ValueError("Argument num_rows must be a 0-D Tensor.  Found:"
+                         " %s" % num_rows_static)
+
+      if num_rows_static < 0:
+        raise ValueError("Argument num_rows must be non-negative.  Found:"
+                         " %s" % num_rows_static)
+    if num_columns_static is not None:
+      if num_columns_static.ndim != 0:
+        raise ValueError("Argument num_columns must be a 0-D Tensor.  Found:"
+                         " %s" % num_columns_static)
+
+      if num_columns_static < 0:
+        raise ValueError("Argument num_columns must be non-negative.  Found:"
+                         " %s" % num_columns_static)
+
+  def _check_batch_shape_possibly_add_asserts(self):
+    """Static check of init arg `batch_shape`, possibly add asserts."""
+    if self._batch_shape_arg is None:
+      return
+
+    # Possibly add asserts
+    if self._assert_proper_shapes:
+      self._batch_shape_arg = control_flow_ops.with_dependencies([
+          check_ops.assert_rank(
+              self._batch_shape_arg,
+              1,
+              message="Argument batch_shape must be a 1-D Tensor."),
+          check_ops.assert_non_negative(
+              self._batch_shape_arg,
+              message="Argument batch_shape must be non-negative."),
+      ], self._batch_shape_arg)
+
+    # Static checks
+    if not self._batch_shape_arg.dtype.is_integer:
+      raise TypeError("Argument batch_shape must be integer type.  Found:"
+                      " %s" % self._batch_shape_arg)
+
+    if self._batch_shape_static is None:
+      return  # Cannot do any other static checks.
+
+    if self._batch_shape_static.ndim != 1:
+      raise ValueError("Argument batch_shape must be a 1-D Tensor.  Found:"
+                       " %s" % self._batch_shape_static)
+
+    if np.any(self._batch_shape_static < 0):
+      raise ValueError("Argument batch_shape must be non-negative.  Found:"
+                       "%s" % self._batch_shape_static)
+
+  def _min_matrix_dim(self):
+    """Minimum of domain/range dimension, if statically available, else None."""
+    domain_dim = self.domain_dimension.value
+    range_dim = self.range_dimension.value
+    if domain_dim is None or range_dim is None:
+      return None
+    return min(domain_dim, range_dim)
+
+  def _min_matrix_dim_tensor(self):
+    """Minimum of domain/range dimension, as a tensor."""
+    return math_ops.reduce_min(self.shape_tensor()[-2:])
+
+  def _zeros_diag(self):
+    """Returns the diagonal of this operator as all zeros."""
+    if self.shape.is_fully_defined():
+      d_shape = self.batch_shape.concatenate([self._min_matrix_dim()])
+    else:
+      d_shape = array_ops.concat(
+          [self.batch_shape_tensor(),
+           [self._min_matrix_dim_tensor()]], axis=0)
+
+    return array_ops.zeros(shape=d_shape, dtype=self.dtype)
diff --git a/tensorflow/python/ops/parallel_for/__init__.py b/tensorflow/python/ops/parallel_for/__init__.py
index b49d865968b0bab02380cb934431f4933590570e..dd8bc6d487f625c9ab442c91da417dce00074a2a 100644
--- a/tensorflow/python/ops/parallel_for/__init__.py
+++ b/tensorflow/python/ops/parallel_for/__init__.py
@@ -23,13 +23,3 @@ from tensorflow.python.ops.parallel_for.control_flow_ops import for_loop
 from tensorflow.python.ops.parallel_for.control_flow_ops import pfor
 from tensorflow.python.ops.parallel_for.gradients import batch_jacobian
 from tensorflow.python.ops.parallel_for.gradients import jacobian
-from tensorflow.python.util.all_util import remove_undocumented
-
-_allowed_symbols = [
-    'pfor',
-    'for_loop',
-    'jacobian',
-    'batch_jacobian',
-]
-
-remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 5979b76ff24a8ec3b1a34d57cf1bfb89448fa0e8..8b259b6b6b3fc7198c496a2ab3c70aa8ea1fe8c6 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -196,15 +196,16 @@ class ResourceVariable(variables.RefVariable):
   the variable are fixed. The value can be changed using one of the assign
   methods.
 
-  Just like any `Tensor`, variables created with `ResourceVariable()` can be
-  used as inputs for other Ops in the graph. Additionally, all the operators
-  overloaded for the `Tensor` class are carried over to variables, so you can
-  also add nodes to the graph by just doing arithmetic on variables.
+  Just like any `Tensor`, variables created with
+  `tf.Variable(use_resource=True)` can be used as inputs for other Ops in the
+  graph. Additionally, all the operators overloaded for the `Tensor` class are
+  carried over to variables, so you can also add nodes to the graph by just
+  doing arithmetic on variables.
 
-  Unlike tf.Variable, a tf.ResourceVariable has well-defined semantics. Each
+  Unlike ref-based variable, a ResourceVariable has well-defined semantics. Each
   usage of a ResourceVariable in a TensorFlow graph adds a read_value operation
-  to the graph. The Tensors returned by a read_value operation are guaranteed
-  to see all modifications to the value of the variable which happen in any
+  to the graph. The Tensors returned by a read_value operation are guaranteed to
+  see all modifications to the value of the variable which happen in any
   operation on which the read_value depends on (either directly, indirectly, or
   via a control dependency) and guaranteed to not see any modification to the
   value of the variable from operations that depend on the read_value operation.
@@ -218,7 +219,7 @@ class ResourceVariable(variables.RefVariable):
   can cause tf.Variable and tf.ResourceVariable to behave differently:
 
   ```python
-  a = tf.ResourceVariable(1.0)
+  a = tf.Variable(1.0, use_resource=True)
   a.initializer.run()
 
   assign = a.assign(2.0)
@@ -742,8 +743,14 @@ class ResourceVariable(variables.RefVariable):
   def _read_variable_op(self):
     if self.trainable:
       tape.watch_variable(self)
-    return gen_resource_variable_ops.read_variable_op(self._handle,
-                                                      self._dtype)
+    result = gen_resource_variable_ops.read_variable_op(self._handle,
+                                                        self._dtype)
+    if not context.executing_eagerly():
+      # Note that if a control flow context is active the input of the read op
+      # might not actually be the handle. This line bypasses it.
+      tape.record_operation(
+          "ReadVariableOp", [result], [self._handle], lambda x: [x])
+    return result
 
   def read_value(self):
     """Constructs an op which reads the value of this variable.
@@ -1294,16 +1301,3 @@ def is_resource_variable(var):
   """"Returns True if `var` is to be considered a ResourceVariable."""
   return isinstance(var, ResourceVariable) or hasattr(
       var, "_should_act_as_resource_variable")
-
-
-_DEFAULT_USE_RESOURCE = False
-
-
-def _default_variable_creator(_, *args, **kwds):
-  use_resource = kwds.pop("use_resource", _DEFAULT_USE_RESOURCE)
-  use_resource = use_resource or context.executing_eagerly()
-  if use_resource:
-    return ResourceVariable(*args, **kwds)
-  return variables.RefVariable(*args, **kwds)
-
-variables.default_variable_creator = _default_variable_creator
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index deba133fb9910f28c7f902f334174734c3c742f7..7b6ab20975114da5599aabdd0c6e664191f5fe48 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -417,24 +417,30 @@ def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
 
     # Backward direction
     if not time_major:
-      time_dim = 1
-      batch_dim = 0
+      time_axis = 1
+      batch_axis = 0
     else:
-      time_dim = 0
-      batch_dim = 1
+      time_axis = 0
+      batch_axis = 1
 
-    def _reverse(input_, seq_lengths, seq_dim, batch_dim):
+    def _reverse(input_, seq_lengths, seq_axis, batch_axis):
       if seq_lengths is not None:
         return array_ops.reverse_sequence(
             input=input_, seq_lengths=seq_lengths,
-            seq_dim=seq_dim, batch_dim=batch_dim)
+            seq_axis=seq_axis, batch_axis=batch_axis)
       else:
-        return array_ops.reverse(input_, axis=[seq_dim])
+        return array_ops.reverse(input_, axis=[seq_axis])
 
     with vs.variable_scope("bw") as bw_scope:
-      inputs_reverse = _reverse(
-          inputs, seq_lengths=sequence_length,
-          seq_dim=time_dim, batch_dim=batch_dim)
+
+      def _map_reverse(inp):
+        return _reverse(
+            inp,
+            seq_lengths=sequence_length,
+            seq_axis=time_axis,
+            batch_axis=batch_axis)
+
+      inputs_reverse = nest.map_structure(_map_reverse, inputs)
       tmp, output_state_bw = dynamic_rnn(
           cell=cell_bw, inputs=inputs_reverse, sequence_length=sequence_length,
           initial_state=initial_state_bw, dtype=dtype,
@@ -443,7 +449,7 @@ def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
 
   output_bw = _reverse(
       tmp, seq_lengths=sequence_length,
-      seq_dim=time_dim, batch_dim=batch_dim)
+      seq_axis=time_axis, batch_axis=batch_axis)
 
   outputs = (output_fw, output_bw)
   output_states = (output_state_fw, output_state_bw)
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 70805fd5725d66e430005f8a43396b99106607c8..42806ba6ec486b88085ddc063c82a6873a1b23c8 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -1261,6 +1261,11 @@ class MultiRNNCell(RNNCell):
       raise TypeError(
           "cells must be a list or tuple, but saw: %s." % cells)
 
+    if len(set([id(cell) for cell in cells])) < len(cells):
+      logging.log_first_n(logging.WARN,
+                          "At least two cells provided to MultiRNNCell "
+                          "are the same object and will share weights.", 1)
+
     self._cells = cells
     for cell_number, cell in enumerate(self._cells):
       # Add Checkpointable dependencies on these cells so their variables get
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index cc92da4fd7afd49d0dd80bd859d7393f2761303f..f86dfb35276f608c5cb323fe5deceb58733be007 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -554,7 +554,7 @@ class _EagerTensorArray(object):
       self._tensor_array.extend([None for _ in range(index - size + 1)])
 
     if not isinstance(value, ops.EagerTensor):
-      value = constant_op.constant(value)
+      value = ops.convert_to_tensor(value)
 
     if self._infer_shape:
       if self._element_shape is None:
@@ -633,8 +633,8 @@ class _EagerTensorArray(object):
   def split(self, value, lengths, name=None):
     """See TensorArray."""
     # error checking to match graph-mode errors
-    value = constant_op.constant(value)
-    lengths = constant_op.constant(lengths)
+    value = ops.convert_to_tensor(value)
+    lengths = ops.convert_to_tensor(lengths)
     sum_lengths = math_ops.reduce_sum(lengths)
     if lengths.shape.ndims != 1:
       raise errors_impl.InvalidArgumentError(
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 0f37dcc0277b844aa5bfd93e26d49e50e908d3a6..aca44bcd449d05db5885768391262284e61bf07b 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -2349,7 +2349,10 @@ def default_variable_creator(next_creator=None, **kwargs):
   validate_shape = kwargs.get("validate_shape", True)
   caching_device = kwargs.get("caching_device", None)
   name = kwargs.get("name", None)
+  variable_def = kwargs.get("variable_def", None)
   dtype = kwargs.get("dtype", None)
+  expected_shape = kwargs.get("expected_shape", None)
+  import_scope = kwargs.get("import_scope", None)
   constraint = kwargs.get("constraint", None)
   use_resource = kwargs.get("use_resource", None)
 
@@ -2360,23 +2363,24 @@ def default_variable_creator(next_creator=None, **kwargs):
 
   if use_resource is None:
     use_resource = get_variable_scope().use_resource
-  if use_resource or (use_resource is None and context.executing_eagerly()):
+  use_resource = use_resource or context.executing_eagerly()
+  if use_resource:
     return resource_variable_ops.ResourceVariable(
         initial_value=initial_value, trainable=trainable,
         collections=collections, validate_shape=validate_shape,
         caching_device=caching_device, name=name, dtype=dtype,
-        constraint=constraint)
-  elif not use_resource and context.executing_eagerly():
-    raise RuntimeError(
-        "VariableScope should use resource variable when eager execution is"
-        " enabled, but use_resource is False."
-    )
+        constraint=constraint, variable_def=variable_def,
+        import_scope=import_scope)
   else:
-    return variables.Variable(
+    return variables.RefVariable(
         initial_value=initial_value, trainable=trainable,
         collections=collections, validate_shape=validate_shape,
         caching_device=caching_device, name=name, dtype=dtype,
-        constraint=constraint)
+        constraint=constraint, variable_def=variable_def,
+        expected_shape=expected_shape, import_scope=import_scope)
+
+
+variables.default_variable_creator = default_variable_creator
 
 
 def _make_getter(captured_getter, captured_previous):
@@ -2384,36 +2388,8 @@ def _make_getter(captured_getter, captured_previous):
   return lambda **kwargs: captured_getter(captured_previous, **kwargs)
 
 
-def variable(initial_value=None,
-             trainable=None,
-             collections=None,
-             validate_shape=True,
-             caching_device=None,
-             name=None,
-             dtype=None,
-             constraint=None,
-             use_resource=None,
-             synchronization=VariableSynchronization.AUTO,
-             aggregation=VariableAggregation.NONE):
-  previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
-  for getter in ops.get_default_graph()._variable_creator_stack:  # pylint: disable=protected-access
-    previous_getter = _make_getter(getter, previous_getter)
-
-  # Reset `aggregation` that is explicitly set as `None` to the enum None value.
-  if aggregation is None:
-    aggregation = VariableAggregation.NONE
-  return previous_getter(
-      initial_value=initial_value,
-      trainable=trainable,
-      collections=collections,
-      validate_shape=validate_shape,
-      caching_device=caching_device,
-      name=name,
-      dtype=dtype,
-      constraint=constraint,
-      use_resource=use_resource,
-      synchronization=synchronization,
-      aggregation=aggregation)
+# TODO(apassos) remove forwarding symbol
+variable = variables.Variable
 
 
 @tf_contextlib.contextmanager
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index 6bb2d6f66963156f712d452547c3a474a8f6302b..fc00ce68aeaf49ea88b1a40ee40ecebe69bb0eee 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -40,15 +40,15 @@ from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
 
-def default_variable_creator(_, *args, **kwds):
-  del args, kwds
-  raise NotImplementedError("resource_variable_ops needs to be imported")
+def default_variable_creator(_, **kwds):
+  del kwds
+  raise NotImplementedError("variable_scope needs to be imported")
 
 
 def _make_getter(captured_getter, captured_previous):
   """To avoid capturing loop variables."""
-  def getter(*args, **kwargs):
-    return captured_getter(captured_previous, *args, **kwargs)
+  def getter(**kwargs):
+    return captured_getter(captured_previous, **kwargs)
   return getter
 
 
@@ -86,11 +86,48 @@ class VariableAggregation(enum.Enum):
 class VariableMetaclass(type):
   """Metaclass to allow construction of tf.Variable to be overridden."""
 
+  def _variable_call(cls,
+                     initial_value=None,
+                     trainable=None,
+                     collections=None,
+                     validate_shape=True,
+                     caching_device=None,
+                     name=None,
+                     variable_def=None,
+                     dtype=None,
+                     expected_shape=None,
+                     import_scope=None,
+                     constraint=None,
+                     use_resource=None,
+                     synchronization=VariableSynchronization.AUTO,
+                     aggregation=VariableAggregation.NONE):
+    """Call on Variable class. Useful to force the signature."""
+    previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
+    for getter in ops.get_default_graph()._variable_creator_stack:  # pylint: disable=protected-access
+      previous_getter = _make_getter(getter, previous_getter)
+
+    # Reset `aggregation` that is explicitly set as `None` to the enum NONE.
+    if aggregation is None:
+      aggregation = VariableAggregation.NONE
+    return previous_getter(
+        initial_value=initial_value,
+        trainable=trainable,
+        collections=collections,
+        validate_shape=validate_shape,
+        caching_device=caching_device,
+        name=name,
+        variable_def=variable_def,
+        dtype=dtype,
+        expected_shape=expected_shape,
+        import_scope=import_scope,
+        constraint=constraint,
+        use_resource=use_resource,
+        synchronization=synchronization,
+        aggregation=aggregation)
+
   def __call__(cls, *args, **kwargs):
     if cls is Variable:
-      previous_getter = lambda *a, **k: default_variable_creator(None, *a, **k)
-      # TODO(apassos) use a stack of getters here
-      return previous_getter(*args, **kwargs)
+      return cls._variable_call(*args, **kwargs)
     else:
       return super(VariableMetaclass, cls).__call__(*args, **kwargs)
 
@@ -183,27 +220,31 @@ class Variable(six.with_metaclass(VariableMetaclass,
   various `Optimizer` classes use this collection as the default list of
   variables to optimize.
 
-  WARNING: tf.Variable objects have a non-intuitive memory model. A Variable is
-  represented internally as a mutable Tensor which can non-deterministically
-  alias other Tensors in a graph. The set of operations which consume a Variable
-  and can lead to aliasing is undetermined and can change across TensorFlow
-  versions. Avoid writing code which relies on the value of a Variable either
-  changing or not changing as other operations happen. For example, using
-  Variable objects or simple functions thereof as predicates in a `tf.cond` is
-  dangerous and error-prone:
+  WARNING: tf.Variable objects by default have a non-intuitive memory model. A
+  Variable is represented internally as a mutable Tensor which can
+  non-deterministically alias other Tensors in a graph. The set of operations
+  which consume a Variable and can lead to aliasing is undetermined and can
+  change across TensorFlow versions. Avoid writing code which relies on the
+  value of a Variable either changing or not changing as other operations
+  happen. For example, using Variable objects or simple functions thereof as
+  predicates in a `tf.cond` is dangerous and error-prone:
 
   ```
   v = tf.Variable(True)
   tf.cond(v, lambda: v.assign(False), my_false_fn)  # Note: this is broken.
   ```
 
-  Here replacing tf.Variable with tf.contrib.eager.Variable will fix any
-  nondeterminism issues.
+  Here replacing adding `use_resource=True` when constructing the variable will
+  fix any nondeterminism issues:
+  ```
+  v = tf.Variable(True, use_resource=True)
+  tf.cond(v, lambda: v.assign(False), my_false_fn)
+  ```
 
   To use the replacement for variables which does
   not have these issues:
 
-  * Replace `tf.Variable` with `tf.contrib.eager.Variable`;
+  * Add `use_resource=True` when constructing `tf.Variable`;
   * Call `tf.get_variable_scope().set_use_resource(True)` inside a
     `tf.variable_scope` before the `tf.get_variable()` call.
   """
@@ -650,8 +691,8 @@ class Variable(six.with_metaclass(VariableMetaclass,
   @staticmethod
   def from_proto(variable_def, import_scope=None):
     """Returns a `Variable` object created from `variable_def`."""
-    return Variable(variable_def=variable_def,
-                    import_scope=import_scope)
+    return RefVariable(variable_def=variable_def,
+                       import_scope=import_scope)
 
   class SaveSliceInfo(object):
     """Information on how to save this Variable as a slice.
@@ -832,19 +873,7 @@ class RefVariable(Variable):
       ValueError: If the initial value is not specified, or does not have a
         shape and `validate_shape` is `True`.
       RuntimeError: If eager execution is enabled.
-
-    @compatibility(eager)
-    `tf.Variable` is not compatible with eager execution.  Use
-    `tfe.Variable` instead which is compatible with both eager execution
-    and graph construction.  See [the TensorFlow Eager Execution
-    guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/g3doc/guide.md#variables-and-optimizers)
-    for details on how variables work in eager execution.
-    @end_compatibility
     """
-    if context.executing_eagerly():
-      raise RuntimeError(
-          "tf.Variable not supported when eager execution is enabled. "
-          "Please use tf.contrib.eager.Variable instead")
     self._in_graph_mode = True
     if variable_def:
       # If variable_def is provided, recreates the variable from its fields.
@@ -955,8 +984,7 @@ class RefVariable(Variable):
       # Ensure that we weren't lifted into the eager context.
       if context.executing_eagerly():
         raise RuntimeError(
-            "tf.Variable not supported when eager execution is enabled. "
-            "Please use tf.contrib.eager.Variable instead")
+            "RefVariable not supported when eager execution is enabled. ")
       with ops.name_scope(name, "Variable", [] if init_from_fn else
                           [initial_value]) as name:
 
diff --git a/tensorflow/python/profiler/profile_context.py b/tensorflow/python/profiler/profile_context.py
index 18eb66ef988c9f49eb04264545d417d8a986e16e..fa4260a7120d72eacff32a7b4960b34545eb32e5 100644
--- a/tensorflow/python/profiler/profile_context.py
+++ b/tensorflow/python/profiler/profile_context.py
@@ -88,16 +88,19 @@ def _profiled_run(self,
       to_profiles = self.profile_context._profile_candidates()
       for to_prof in to_profiles:
         cmd, opts, _ = to_prof
+        saved_views = self.profile_context._views.setdefault(cmd, {})
         if self.profile_context._debug:
           sys.stderr.write('debug: profiling %s step: %d\n' % (cmd, step))
         if cmd == 'graph':
-          self.profile_context.profiler.profile_graph(opts)
+          saved_views[step] = self.profile_context.profiler.profile_graph(opts)
         elif cmd == 'scope':
-          self.profile_context.profiler.profile_name_scope(opts)
+          saved_views[step] = self.profile_context.profiler.profile_name_scope(
+              opts)
         elif cmd == 'op':
-          self.profile_context.profiler.profile_operations(opts)
+          saved_views[step] = self.profile_context.profiler.profile_operations(
+              opts)
         elif cmd == 'code':
-          self.profile_context.profiler.profile_python(opts)
+          saved_views[step] = self.profile_context.profiler.profile_python(opts)
         else:
           raise ValueError('Unknown cmd: %s\n' % cmd)
       return ret
@@ -185,8 +188,30 @@ class ProfileContext(object):
     self._traced_steps = 0
     self._auto_profiles = []
     self._profiler = None
+    self._views = {}
     self._lock = threading.Lock()
 
+  def get_profiles(self, cmd):
+    """Returns profiling results for each step at which `cmd` was run.
+
+    Args:
+      cmd: string, profiling command used in an `add_auto_profiling` call.
+
+    Returns:
+      dict[int: (MultiGraphNodeProto | GraphNodeProto)]. Keys are steps at which
+      the profiling command was run. Values are the outputs of profiling.
+      For "code" and "op" commands this will be a `MultiGraphNodeProto`, for
+      "scope" and "graph" commands this will be a `GraphNodeProto.
+
+    Raises:
+      ValueError: if `cmd` was never run (either because no session.run call was
+      made or because there was no `add_auto_profiling` call with the specified
+      `cmd`.
+    """
+    if cmd not in self._views:
+      raise ValueError('No autoprofiler for command: {}, was run'.format(cmd))
+    return self._views[cmd]
+
   def add_auto_profiling(self, cmd, options, profile_steps):
     """Traces and profiles at some session run steps.
 
diff --git a/tensorflow/python/profiler/profile_context_test.py b/tensorflow/python/profiler/profile_context_test.py
index a623beee23ebf98cf96bd0f334f813db5ae04040..107ad443c32e20ab69f3c2fb71c652d97a9c0cc6 100644
--- a/tensorflow/python/profiler/profile_context_test.py
+++ b/tensorflow/python/profiler/profile_context_test.py
@@ -61,6 +61,8 @@ class ProfilerContextTest(test.TestCase):
               profile_str = f.read()
             gfile.Remove(outfile)
 
+      self.assertEqual(set([15, 50, 100]), set(pctx.get_profiles("op").keys()))
+
     with lib.ProfilerFromFile(
         os.path.join(test.get_temp_dir(), "profile_100")) as profiler:
       profiler.profile_operations(options=opts)
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index e58be804c2738dbad0e2f90c21d6eff3832a8148..8c985a7c2fa2b515c2daed1349996dd30f6d7ce1 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -34,6 +34,7 @@ from tensorflow.python.platform import tf_logging
 from tensorflow.python.saved_model import constants
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.util import compat
+from tensorflow.python.util.deprecation import deprecated_args
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -133,39 +134,32 @@ class SavedModelBuilder(object):
     tf_logging.info("Assets written to: %s",
                     compat.as_text(assets_destination_dir))
 
-  def _maybe_add_legacy_init_op(self, legacy_init_op=None):
-    """Add legacy init op to the SavedModel.
+  def _maybe_add_main_op(self, main_op):
+    """Adds main op to the SavedModel.
 
     Args:
-      legacy_init_op: Optional legacy init op to support backward compatibility.
+      main_op: Main op to run as part of graph initialization. If None, no
+        main op will be added to the graph.
 
     Raises:
-      TypeError if legacy init op is not of type `Operation`.
-      AssertionError if the graph already contains one or more legacy init ops.
+      TypeError: if main op is provided but is not of type `Operation`.
+      ValueError: if the Graph already contains an init op.
     """
-    if legacy_init_op is not None:
-      if not isinstance(legacy_init_op, ops.Operation):
-        raise TypeError("legacy_init_op needs to be an Operation: %r" %
-                        legacy_init_op)
-      if ops.get_collection(constants.LEGACY_INIT_OP_KEY):
-        raise AssertionError(
-            "graph already contains one or more legacy init ops under the "
-            "collection {}.".format(constants.LEGACY_INIT_OP_KEY))
-      ops.add_to_collection(constants.LEGACY_INIT_OP_KEY, legacy_init_op)
-
-  def _add_main_op(self, main_op):
-    """Add main op to the SavedModel.
+    if main_op is None:
+      return
 
-    Args:
-      main_op: Main op to run as part of graph initialization.
+    if not isinstance(main_op, ops.Operation):
+      raise TypeError("main_op needs to be an Operation: %r" % main_op)
 
-    Raises:
-      TypeError if main op is not of type `Operation`.
-    """
-    if main_op is not None:
-      if not isinstance(main_op, ops.Operation):
-        raise TypeError("main_op needs to be an Operation: %r" % main_op)
-      ops.add_to_collection(constants.MAIN_OP_KEY, main_op)
+    # Validate that no other init ops have been added to this graph already.
+    # We check main_op and legacy_init_op for thoroughness and explicitness.
+    for init_op_key in (constants.MAIN_OP_KEY, constants.LEGACY_INIT_OP_KEY):
+      if ops.get_collection(init_op_key):
+        raise ValueError(
+            "Graph already contains one or more main ops under the "
+            "collection {}.".format(init_op_key))
+
+    ops.add_to_collection(constants.MAIN_OP_KEY, main_op)
 
   def _add_train_op(self, train_op):
     """Add train op to the SavedModel.
@@ -257,16 +251,12 @@ class SavedModelBuilder(object):
           self._validate_tensor_info(outputs[outputs_key])
 
   def _add_collections(
-      self, assets_collection, legacy_init_op, main_op, train_op):
+      self, assets_collection, main_op, train_op):
     """Add asset and op collections to be saved."""
     # Save asset files and write them to disk, if any.
     self._save_and_write_assets(assets_collection)
 
-    if main_op is None:
-      # Add legacy init op to the SavedModel.
-      self._maybe_add_legacy_init_op(legacy_init_op)
-    else:
-      self._add_main_op(main_op)
+    self._maybe_add_main_op(main_op)
 
     self._add_train_op(train_op)
 
@@ -282,6 +272,9 @@ class SavedModelBuilder(object):
           allow_empty=True)
     return saver
 
+  @deprecated_args(None,
+                   "Pass your op to the equivalent parameter main_op instead.",
+                   "legacy_init_op")
   def add_meta_graph(self,
                      tags,
                      signature_def_map=None,
@@ -306,7 +299,7 @@ class SavedModelBuilder(object):
           that this collection should be a subset of the assets saved as part of
           the first meta graph in the SavedModel.
       legacy_init_op: Legacy support for op or group of ops to execute after the
-          restore op upon a load.
+          restore op upon a load. Deprecated; please use main_op instead.
       clear_devices: Set to true if the device info on the default graph should
           be cleared.
       main_op: Op or group of ops to execute when the graph is loaded. Note
@@ -333,8 +326,12 @@ class SavedModelBuilder(object):
     # properly populated.
     self._validate_signature_def_map(signature_def_map)
 
+    # legacy_init_op is deprecated, and going away in TF 2.0.
+    # Re-mapping to main_op, as treatment is identical regardless.
+    main_op = main_op or legacy_init_op
+
     # Add assets and ops
-    self._add_collections(assets_collection, legacy_init_op, main_op, None)
+    self._add_collections(assets_collection, main_op, None)
 
     saver = self._maybe_create_saver(saver)
 
@@ -351,6 +348,9 @@ class SavedModelBuilder(object):
     # Tag the meta graph def and add it to the SavedModel.
     self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
 
+  @deprecated_args(None,
+                   "Pass your op to the equivalent parameter main_op instead.",
+                   "legacy_init_op")
   def add_meta_graph_and_variables(self,
                                    sess,
                                    tags,
@@ -378,7 +378,7 @@ class SavedModelBuilder(object):
         def.
       assets_collection: Assets collection to be saved with SavedModel.
       legacy_init_op: Legacy support for op or group of ops to execute after the
-          restore op upon a load.
+          restore op upon a load. Deprecated; please use main_op instead.
       clear_devices: Set to true if the device info on the default graph should
           be cleared.
       main_op: Op or group of ops to execute when the graph is loaded. Note
@@ -402,8 +402,12 @@ class SavedModelBuilder(object):
     # properly populated.
     self._validate_signature_def_map(signature_def_map)
 
+    # legacy_init_op is deprecated, and going away in TF 2.0.
+    # Re-mapping to main_op, as treatment is identical regardless.
+    main_op = main_op or legacy_init_op
+
     # Add assets and ops
-    self._add_collections(assets_collection, legacy_init_op, main_op, None)
+    self._add_collections(assets_collection, main_op, None)
 
     # Create the variables sub-directory, if it does not exist.
     variables_dir = os.path.join(
diff --git a/tensorflow/python/saved_model/constants.py b/tensorflow/python/saved_model/constants.py
index 61c6ffbd0d11ef48c6dfb8d14a4328df7f7c5df5..cb251f08bb56fd5496ea4f3aaedfd2822ae1565c 100644
--- a/tensorflow/python/saved_model/constants.py
+++ b/tensorflow/python/saved_model/constants.py
@@ -60,6 +60,10 @@ SAVED_MODEL_FILENAME_PBTXT = "saved_model.pbtxt"
 tf_export("saved_model.constants.SAVED_MODEL_FILENAME_PBTXT").export_constant(
     __name__, "SAVED_MODEL_FILENAME_PBTXT")
 
+# File name for json format of SavedModel.
+# Not exported while keras_saved_model is in contrib.
+SAVED_MODEL_FILENAME_JSON = "saved_model.json"
+
 # Subdirectory name containing the variables/checkpoint files.
 VARIABLES_DIRECTORY = "variables"
 tf_export("saved_model.constants.VARIABLES_DIRECTORY").export_constant(
@@ -69,5 +73,3 @@ tf_export("saved_model.constants.VARIABLES_DIRECTORY").export_constant(
 VARIABLES_FILENAME = "variables"
 tf_export("saved_model.constants.VARIABLES_FILENAME").export_constant(
     __name__, "VARIABLES_FILENAME")
-
-
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index e5f649fdabb5cc2600a6fdd0e5ed9950d6bb23c2..16077f52fab72e7700df7e67782a549bbde21751 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -116,11 +116,14 @@ def _get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
   return asset_tensor_dict
 
 
-def _get_main_op_tensor(meta_graph_def_to_load):
+def _get_main_op_tensor(
+    meta_graph_def_to_load, init_op_key=constants.MAIN_OP_KEY):
   """Gets the main op tensor, if one exists.
 
   Args:
     meta_graph_def_to_load: The meta graph def from the SavedModel to be loaded.
+    init_op_key: name of collection to check; should be one of MAIN_OP_KEY
+      or the deprecated LEGACY_INIT_OP_KEY
 
   Returns:
     The main op tensor, if it exists and `None` otherwise.
@@ -131,38 +134,15 @@ def _get_main_op_tensor(meta_graph_def_to_load):
   """
   collection_def = meta_graph_def_to_load.collection_def
   main_op_tensor = None
-  if constants.MAIN_OP_KEY in collection_def:
-    main_ops = collection_def[constants.MAIN_OP_KEY].node_list.value
+  if init_op_key in collection_def:
+    main_ops = collection_def[init_op_key].node_list.value
     if len(main_ops) != 1:
-      raise RuntimeError("Expected exactly one SavedModel main op.")
-    main_op_tensor = ops.get_collection(constants.MAIN_OP_KEY)[0]
+      raise RuntimeError("Expected exactly one SavedModel main op. "
+                         "Found: {}".format(main_ops))
+    main_op_tensor = ops.get_collection(init_op_key)[0]
   return main_op_tensor
 
 
-def _get_legacy_init_op_tensor(meta_graph_def_to_load):
-  """Gets the legacy init op tensor, if one exists.
-
-  Args:
-    meta_graph_def_to_load: The meta graph def from the SavedModel to be loaded.
-
-  Returns:
-    The legacy init op tensor, if it exists and `None` otherwise.
-
-  Raises:
-    RuntimeError: If the collection def corresponding to the legacy init op key
-        has other than exactly one tensor.
-  """
-  collection_def = meta_graph_def_to_load.collection_def
-  legacy_init_op_tensor = None
-  if constants.LEGACY_INIT_OP_KEY in collection_def:
-    legacy_init_ops = collection_def[
-        constants.LEGACY_INIT_OP_KEY].node_list.value
-    if len(legacy_init_ops) != 1:
-      raise RuntimeError("Expected exactly one legacy serving init op.")
-    legacy_init_op_tensor = ops.get_collection(constants.LEGACY_INIT_OP_KEY)[0]
-  return legacy_init_op_tensor
-
-
 @tf_export("saved_model.loader.maybe_saved_model_directory")
 def maybe_saved_model_directory(export_dir):
   """Checks whether the provided export directory could contain a SavedModel.
@@ -284,12 +264,15 @@ class SavedModelLoader(object):
       **saver_kwargs: keyword arguments to pass to tf.train.import_meta_graph.
 
     Returns:
-      Saver defined by the MetaGraph, which can be used to restore the variable
-      values.
+      A tuple of
+        * Saver defined by the MetaGraph, which can be used to restore the
+          variable values.
+        * List of `Operation`/`Tensor` objects returned from
+          `tf.import_graph_def` (may be `None`).
     """
     meta_graph_def = self.get_meta_graph_def_from_tags(tags)
     with graph.as_default():
-      return tf_saver.import_meta_graph(
+      return tf_saver._import_meta_graph_with_return_elements(  # pylint: disable=protected-access
           meta_graph_def, import_scope=import_scope, **saver_kwargs)
 
   def restore_variables(self, sess, saver, import_scope=None):
@@ -340,8 +323,8 @@ class SavedModelLoader(object):
           self._export_dir, meta_graph_def, import_scope=import_scope)
 
       main_op_tensor = (
-          _get_main_op_tensor(meta_graph_def) or
-          (_get_legacy_init_op_tensor(meta_graph_def)))
+          _get_main_op_tensor(meta_graph_def, constants.MAIN_OP_KEY) or
+          _get_main_op_tensor(meta_graph_def, constants.LEGACY_INIT_OP_KEY))
       if main_op_tensor is not None:
         sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary)
 
@@ -361,8 +344,8 @@ class SavedModelLoader(object):
       `MetagraphDef` proto of the graph that was loaded.
     """
     with sess.graph.as_default():
-      saver = self.load_graph(sess.graph, tags, import_scope,
-                              **saver_kwargs)
+      saver, _ = self.load_graph(sess.graph, tags, import_scope,
+                                 **saver_kwargs)
       self.restore_variables(sess, saver, import_scope)
       self.run_init_ops(sess, tags, import_scope)
     return self.get_meta_graph_def_from_tags(tags)
diff --git a/tensorflow/python/saved_model/loader_test.py b/tensorflow/python/saved_model/loader_test.py
index ce18859f6b9e4c141c4b27f3643c8d4004eb56f6..9a0b276a4b20390ad6bae012e4d61e39c57ac4fc 100644
--- a/tensorflow/python/saved_model/loader_test.py
+++ b/tensorflow/python/saved_model/loader_test.py
@@ -111,7 +111,8 @@ class SavedModelLoaderTest(test.TestCase):
   def test_load_with_import_scope(self):
     loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
     with self.test_session(graph=ops.Graph()) as sess:
-      saver = loader.load_graph(sess.graph, ["foo_graph"], import_scope="baz")
+      saver, _ = loader.load_graph(
+          sess.graph, ["foo_graph"], import_scope="baz")
 
       # The default saver should not work when the import scope is set.
       with self.assertRaises(errors.NotFoundError):
@@ -149,7 +150,7 @@ class SavedModelLoaderTest(test.TestCase):
   def test_run_init_op(self):
     loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
     graph = ops.Graph()
-    saver = loader.load_graph(graph, ["foo_graph"])
+    saver, _ = loader.load_graph(graph, ["foo_graph"])
     with self.test_session(graph=graph) as sess:
       loader.restore_variables(sess, saver)
       self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
@@ -203,7 +204,7 @@ class SavedModelLoaderTest(test.TestCase):
 
     loader = loader_impl.SavedModelLoader(path)
     with self.test_session(graph=ops.Graph()) as sess:
-      saver = loader.load_graph(sess.graph, ["foo_graph"])
+      saver, _ = loader.load_graph(sess.graph, ["foo_graph"])
       self.assertFalse(variables._all_saveable_objects())
       self.assertIsNotNone(saver)
 
@@ -212,6 +213,18 @@ class SavedModelLoaderTest(test.TestCase):
       self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
       self.assertEqual(11, sess.graph.get_tensor_by_name("y:0").eval())
 
+  def test_load_saved_model_graph_with_return_elements(self):
+    """Ensure that the correct elements are returned."""
+    loader = loader_impl.SavedModelLoader(SIMPLE_ADD_SAVED_MODEL)
+    graph = ops.Graph()
+    _, ret = loader.load_graph(graph, ["foo_graph"],
+                               return_elements=["y:0", "x:0"])
+
+    self.assertEqual(graph.get_tensor_by_name("y:0"), ret[0])
+    self.assertEqual(graph.get_tensor_by_name("x:0"), ret[1])
+
+    with self.assertRaisesRegexp(ValueError, "not found in graph"):
+      loader.load_graph(graph, ["foo_graph"], return_elements=["z:0"])
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index fb4732aca21d4661aaea21a472475690687a42be..00b669fc97950d25a6e29f728c649f4dba482162 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -846,9 +846,19 @@ class SavedModelTest(test.TestCase):
   def testLegacyInitOpWithNonEmptyCollection(self):
     export_dir = self._get_export_dir(
         "test_legacy_init_op_with_non_empty_collection")
+    self._testInitOpsWithNonEmptyCollection(
+        export_dir, constants.LEGACY_INIT_OP_KEY)
+
+  def testMainOpWithNonEmptyCollection(self):
+    export_dir = self._get_export_dir(
+        "test_main_op_with_non_empty_collection")
+    self._testInitOpsWithNonEmptyCollection(export_dir, constants.MAIN_OP_KEY)
+
+  def _testInitOpsWithNonEmptyCollection(self, export_dir, key):
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    with self.test_session(graph=ops.Graph()) as sess:
+    g = ops.Graph()
+    with self.test_session(graph=g) as sess:
       # Initialize variable `v1` to 1.
       v1 = variables.Variable(1, name="v1")
       ops.add_to_collection("v", v1)
@@ -857,19 +867,21 @@ class SavedModelTest(test.TestCase):
       v2 = variables.Variable(42, name="v2", trainable=False, collections=[])
       ops.add_to_collection("v", v2)
 
-      # Set up an assignment op to be run as part of the legacy_init_op.
+      # Set up an assignment op to be run as part of the init op.
       assign_v2 = state_ops.assign(v2, v1)
-      legacy_init_op = control_flow_ops.group(assign_v2, name="legacy_init_op")
+      init_op = control_flow_ops.group(assign_v2, name="init_op")
 
       sess.run(variables.global_variables_initializer())
 
-      ops.add_to_collection(constants.LEGACY_INIT_OP_KEY,
-                            control_flow_ops.no_op())
-      # AssertionError should be raised since the LEGACY_INIT_OP_KEY collection
+      ops.add_to_collection(key, control_flow_ops.no_op())
+      # ValueError should be raised since the LEGACY_INIT_OP_KEY collection
       # is not empty and we don't support multiple init ops.
-      with self.assertRaises(AssertionError):
+      with self.assertRaisesRegexp(ValueError, "Graph already contains"):
         builder.add_meta_graph_and_variables(
-            sess, ["foo"], legacy_init_op=legacy_init_op)
+            sess, ["foo"], legacy_init_op=init_op)
+      # We shouldn't be able to add as MAIN_OP, either.
+      with self.assertRaisesRegexp(ValueError, "Graph already contains"):
+        builder.add_meta_graph_and_variables(sess, ["foo"], main_op=init_op)
 
   def testTrainOp(self):
     export_dir = self._get_export_dir("test_train_op")
diff --git a/tensorflow/python/summary/writer/writer.py b/tensorflow/python/summary/writer/writer.py
index aca084fc9168e710316e4c988594cff69e54ebab..60e96ee947506d5b020ad1ed580a5da0c4e6bdec 100644
--- a/tensorflow/python/summary/writer/writer.py
+++ b/tensorflow/python/summary/writer/writer.py
@@ -325,7 +325,7 @@ class FileWriter(SummaryToEventTransformer):
     ```
 
     The `session` argument to the constructor makes the returned `FileWriter` a
-    a compatibility layer over new graph-based summaries (`tf.contrib.summary`).
+    compatibility layer over new graph-based summaries (`tf.contrib.summary`).
     Crucially, this means the underlying writer resource and events file will
     be shared with any other `FileWriter` using the same `session` and `logdir`,
     and with any `tf.contrib.summary.SummaryWriter` in this session using the
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index b65c88e972454da14dc5161a19cd26280d51d28f..bcbe5907d6370e0c0a268c2ea6a2f10bdf30683e 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -109,12 +109,13 @@ class AdamOptimizer(optimizer.Optimizer):
     self._updated_lr = None
 
   def _get_beta_accumulators(self):
-    if context.executing_eagerly():
-      graph = None
-    else:
-      graph = ops.get_default_graph()
-    return (self._get_non_slot_variable("beta1_power", graph=graph),
-            self._get_non_slot_variable("beta2_power", graph=graph))
+    with ops.init_scope():
+      if context.executing_eagerly():
+        graph = None
+      else:
+        graph = ops.get_default_graph()
+      return (self._get_non_slot_variable("beta1_power", graph=graph),
+              self._get_non_slot_variable("beta2_power", graph=graph))
 
   def _create_slots(self, var_list):
     # Create the beta1 and beta2 accumulators on the same device as the first
diff --git a/tensorflow/python/training/adam_test.py b/tensorflow/python/training/adam_test.py
index ccdc7e384da2ae792a681298c7076fc582d362df..8f844276540bf5b5cc9a61a3cb072b3cfa9cfa7a 100644
--- a/tensorflow/python/training/adam_test.py
+++ b/tensorflow/python/training/adam_test.py
@@ -315,6 +315,12 @@ class AdamOptimizerTest(test.TestCase):
 
   def testTwoSessions(self):
     optimizer = adam.AdamOptimizer()
+
+    with context.eager_mode():
+      var0 = variables.Variable(np.array([1.0, 2.0]), name="v0")
+      grads0 = constant_op.constant(np.array([0.1, 0.1]))
+      optimizer.apply_gradients([(grads0, var0)])
+
     g = ops.Graph()
     with g.as_default():
       with session.Session():
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index 883f4fd91090a7a0518829cb4276442aa4da9cfc..a052081630f34fe28e4d650e1752cd723fa65731 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -24,7 +24,6 @@ from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
@@ -308,32 +307,19 @@ def _set_checkpoint_initializer(variable,
     restore_op = io_ops.restore_v2(
         ckpt_file, [tensor_name], [slice_spec], [base_type], name=name)[0]
 
-    # TODO(priyag, allenl): Use `SaveableObject.restore` instead here.
-    if resource_variable_ops.is_resource_variable(variable):
-      init_op = variable.assign(restore_op, read_value=False)
-    else:
-      init_op = state_ops.assign(variable, restore_op)
+    names_to_saveables = saver.BaseSaverBuilder.OpListToDict([variable])
+    saveable_objects = []
+    for name, op in names_to_saveables.items():
+      for s in saver.BaseSaverBuilder.SaveableObjectsForOp(op, name):
+        saveable_objects.append(s)
+
+    assert len(saveable_objects) == 1  # Should be only one variable.
+    init_op = saveable_objects[0].restore([restore_op], restored_shapes=None)
 
     # pylint:disable=protected-access
-    # We need special handling for `DistributedVariable`s as they contain
-    # mutliple actual variables. `assign` on a `DistributedVariable` returns a
-    # combined `init_op` which contains initializers for all the contained
-    # variables. We then set each underlying variable's `_initializer_op` using
-    # the corresponding `init_op`.
-    # TODO(priyag): Use `isinstance` checks when `DistributedVariable` class
-    # moves out of contrib.
-    if any(base.__name__ == "DistributedVariable"
-           for base in  variable.__class__.__bases__):
-      assert distribute_lib.get_cross_tower_context()
-      assert hasattr(variable, "_index")
-      for (d, v) in six.iteritems(variable._index):
-        v._initializer_op = init_op._index[d]
-        restore_op.set_shape(v.shape)
-        v._initial_value = restore_op
-    else:
-      variable._initializer_op = init_op
-      restore_op.set_shape(variable.shape)
-      variable._initial_value = restore_op
+    variable._initializer_op = init_op
+    restore_op.set_shape(variable.shape)
+    variable._initial_value = restore_op
     # pylint:enable=protected-access
 
 
diff --git a/tensorflow/python/training/checkpoint_utils_test.py b/tensorflow/python/training/checkpoint_utils_test.py
index 4e08a1c859fbaac75e7cd09ad498d9fea14c6338..1c1f126ce94614ae02d701a4a15c8956e3dd4f2f 100644
--- a/tensorflow/python/training/checkpoint_utils_test.py
+++ b/tensorflow/python/training/checkpoint_utils_test.py
@@ -386,7 +386,9 @@ class CheckpointsTest(test.TestCase):
         op for op in g.get_operations()
         if (op.name.startswith("init_from_checkpoint/") and
             not op.name.startswith("init_from_checkpoint/checkpoint_initializer"
-                                  ) and op.type != "AssignVariableOp")
+                                  ) and
+            op.type != "AssignVariableOp" and
+            op.type != "Identity")
     ]
     self.assertEqual(ops_in_init_from_checkpoint_scope, [])
 
diff --git a/tensorflow/python/training/checkpointable/base.py b/tensorflow/python/training/checkpointable/base.py
index ee35b01328436911fd7926b25b14433377ec4188..66837ee52fd6d1b6b2bb98b82a0b2f293879c7e0 100644
--- a/tensorflow/python/training/checkpointable/base.py
+++ b/tensorflow/python/training/checkpointable/base.py
@@ -144,7 +144,7 @@ class _CheckpointPosition(object):
         # process deferred restorations for it and its dependencies.
         restore_ops = checkpointable._restore_from_checkpoint_position(self)  # pylint: disable=protected-access
         if restore_ops:
-          self._checkpoint.restore_ops.extend(restore_ops)
+          self._checkpoint.new_restore_ops(restore_ops)
 
   def bind_object(self, checkpointable):
     """Set a checkpoint<->object correspondence and process slot variables.
@@ -501,12 +501,6 @@ class CheckpointableBase(object):
       ValueError: If the variable name is not unique.
     """
     self._maybe_initialize_checkpointable()
-    if overwrite and self._lookup_dependency(name) is not None:
-      raise ValueError(
-          ("A variable named '%s' already exists in this Checkpointable, but "
-           "Checkpointable._add_variable called to create another with "
-           "that name. Variable names must be unique within a Checkpointable "
-           "object.") % (name,))
     with ops.init_scope():
       if context.executing_eagerly():
         # If this is a variable with a single Tensor stored in the checkpoint,
diff --git a/tensorflow/python/training/checkpointable/base_test.py b/tensorflow/python/training/checkpointable/base_test.py
index 950e9c5b535a8314e1068b772f48a14b572df691..fd935ac559ed7cd607145e7b2433a00c1f8431ea 100644
--- a/tensorflow/python/training/checkpointable/base_test.py
+++ b/tensorflow/python/training/checkpointable/base_test.py
@@ -16,8 +16,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 from tensorflow.python.training.checkpointable import base
+from tensorflow.python.training.checkpointable import util
 
 
 class InterfaceTests(test.TestCase):
@@ -37,5 +40,22 @@ class InterfaceTests(test.TestCase):
     self.assertIs(duplicate_name_dep, current_dependency)
     self.assertEqual("leaf", current_name)
 
+  def testAddVariableOverwrite(self):
+    root = base.CheckpointableBase()
+    a = root._add_variable_with_custom_getter(
+        name="v", shape=[], getter=variable_scope.get_variable)
+    self.assertEqual([root, a], util.list_objects(root))
+    with ops.Graph().as_default():
+      b = root._add_variable_with_custom_getter(
+          name="v", shape=[], overwrite=True,
+          getter=variable_scope.get_variable)
+      self.assertEqual([root, b], util.list_objects(root))
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          ValueError, "already declared as a dependency"):
+        root._add_variable_with_custom_getter(
+            name="v", shape=[], overwrite=False,
+            getter=variable_scope.get_variable)
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/checkpointable/data_structures.py b/tensorflow/python/training/checkpointable/data_structures.py
index 019d43f09c10a4975a9b483593af30b5bbe06089..507cda87349cda25012a0170f230637d0a9758bc 100644
--- a/tensorflow/python/training/checkpointable/data_structures.py
+++ b/tensorflow/python/training/checkpointable/data_structures.py
@@ -57,6 +57,8 @@ def _wrap_or_unwrap(value):
     return value.value
   if isinstance(value, base.CheckpointableBase):
     return value  # Skip conversion for already checkpointable objects.
+  elif isinstance(value, dict):
+    return _DictWrapper(value)
   elif isinstance(value, list):
     return _ListWrapper(value)
   else:
@@ -438,12 +440,15 @@ class Mapping(CheckpointableDataStructure, collections.Mapping):
   def __init__(self, *args, **kwargs):
     """Construct a new sequence. Arguments are passed to `dict()`."""
     super(Mapping, self).__init__()
-    self._storage = dict(*args, **kwargs)
+    self._storage = self._make_storage(*args, **kwargs)
     self._storage.update(
         {key: self._track_value(
             value, name=self._name_element(key))
          for key, value in self._storage.items()})
 
+  def _make_storage(self, *args, **kwargs):
+    return dict(*args, **kwargs)
+
   def _name_element(self, key):
     if not isinstance(key, six.string_types):
       raise TypeError(
@@ -476,3 +481,185 @@ class Mapping(CheckpointableDataStructure, collections.Mapping):
 
   def __iter__(self):
     return iter(self._storage)
+
+
+# Unlike _ListWrapper, having _DictWrapper inherit from dict and pass isinstance
+# checks seems infeasible. CPython will not call Python methods/properties on
+# dictionary subclasses when running e.g. {}.update(dict_subclass), and instead
+# collects elements directly from dict_subclass's C structs. So subclassing dict
+# implies that the storage has to be "self" (i.e. the C structs for the object
+# must be updated correctly), but we also need that storage to be the wrapped
+# dictionary to avoid synchronization bugs (un-tracked external modifications
+# should still show up when the dict is accessed through the wrapper). Monkey
+# patching all of the "wrapped" dict's methods instead of creating a wrapper
+# object is an option, but not a very attractive one (replacing methods without
+# creating reference cycles is difficult, and then dicts would need to be
+# special cased everywhere as being checkpointable).
+class _DictWrapper(Mapping, collections.MutableMapping):
+  """Wraps built-in dicts to support restore-on-create for variables.
+
+  _DictWrapper is to Mapping as _ListWrapper is to List. Unlike Mapping,
+  _DictWrapper allows non-string keys and values and arbitrary mutations (delete
+  keys, reassign values). Like _ListWrapper, these mutations mean that
+  _DictWrapper will raise an exception on save.
+  """
+
+  def __new__(cls, *args):
+    if len(args) == 1 and isinstance(args[0], dict):
+      return super(_DictWrapper, cls).__new__(cls)
+    else:
+      # Allow construction from a sequence, e.g. for nest.pack_sequence_as. In
+      # this case there's nothing to wrap, so we make a normal dictionary. Also
+      # allows constructing empty instances of the _DictWrapper type, as Session
+      # is wont to do (and again there's nothing to wrap, so a normal dictionary
+      # makes more sense).
+      return dict(*args)
+
+  def __init__(self, wrapped_dict):
+    self._non_string_key = False
+    self._non_append_mutation = False
+    self._external_modification = False
+    super(_DictWrapper, self).__init__(wrapped_dict)
+    self._update_snapshot()
+
+  def _make_storage(self, wrapped_dict):
+    """Re-use the wrapped dict for storage (to force them to be in sync)."""
+    return wrapped_dict
+
+  @property
+  def _checkpoint_dependencies(self):
+    """Check that the object is saveable before listing its dependencies."""
+    self._check_external_modification()
+    if self._non_string_key:
+      raise ValueError(
+          "Unable to save the object %s (a dictionary wrapper constructed "
+          "automatically on attribute assignment). The wrapped dictionary "
+          "contains a non-string key which maps to a checkpointable object or "
+          "mutable data structure.\n\nIf you don't need this dictionary "
+          "checkpointed, wrap it in a tf.contrib.checkpoint.NoDependency "
+          "object; it will be automatically un-wrapped and subsequently "
+          "ignored." % (self,))
+    if self._non_append_mutation:
+      raise ValueError(
+          "Unable to save the object %s (a dictionary wrapper constructed "
+          "automatically on attribute assignment). A key mapping to a "
+          "checkpointable object was overwritten or deleted, which would "
+          "cause problems for restoration.\n\nIf you don't need this "
+          "dictionary checkpointed, wrap it in a "
+          "tf.contrib.checkpoint.NoDependency object; it will be automatically "
+          "un-wrapped and subsequently ignored." % (self,))
+    if self._external_modification:
+      raise ValueError(
+          "Unable to save the object %s (a dictionary wrapper constructed "
+          "automatically on attribute assignment). The wrapped dictionary was "
+          "modified outside the wrapper (its final value was %s, its value "
+          "when a checkpoint dependency was added was %s), which breaks "
+          "restoration on object creation.\n\nIf you don't need this "
+          "dictionary checkpointed, wrap it in a "
+          "tf.contrib.checkpoint.NoDependency object; it will be automatically "
+          "un-wrapped and subsequently ignored." % (
+              self, self, self._last_wrapped_dict_snapshot))
+    assert not self._dirty  # Any reason for dirtiness should have an exception.
+    return super(_DictWrapper, self)._checkpoint_dependencies
+
+  @property
+  def _dirty(self):
+    """Check if there has already been a mutation which prevents saving."""
+    return (self._external_modification
+            or self._non_append_mutation
+            or self._non_string_key)
+
+  def _check_external_modification(self):
+    """Checks for any changes to the wrapped dict not through the wrapper."""
+    if self._dirty:
+      return
+    if self != self._last_wrapped_dict_snapshot:
+      self._external_modification = True
+      self._last_wrapped_dict_snapshot = None
+
+  def _update_snapshot(self):
+    """Acknowledges tracked changes to the wrapped dict."""
+    if self._dirty:
+      return
+    self._last_wrapped_dict_snapshot = dict(self)
+
+  def _track_value(self, value, name):
+    """Allows storage of non-checkpointable objects."""
+    if isinstance(name, six.string_types):
+      string_key = True
+    else:
+      name = "-non_string_key"
+      string_key = False
+    try:
+      no_dependency = isinstance(value, NoDependency)
+      value = super(_DictWrapper, self)._track_value(value=value, name=name)
+      if not (string_key or no_dependency):
+        # A non-string key maps to a checkpointable value. This data structure
+        # is not saveable.
+        self._non_string_key = True
+      return value
+    except ValueError:
+      # Even if this value isn't checkpointable, we need to make sure
+      # NoDependency objects get unwrapped.
+      return sticky_attribute_assignment(
+          checkpointable=self, value=value, name=name)
+
+  def _name_element(self, key):
+    """Don't throw errors for non-string keys."""
+    if isinstance(key, six.string_types):
+      return super(_DictWrapper, self)._name_element(key)
+    else:
+      return key
+
+  def __setitem__(self, key, value):
+    """Allow any modifications, but possibly mark the wrapper as unsaveable."""
+    self._check_external_modification()
+    no_dep = isinstance(value, NoDependency)
+    if isinstance(key, six.string_types):
+      existing_dependency = self._lookup_dependency(key)
+      value = self._track_value(value, name=key)
+    else:
+      value = _wrap_or_unwrap(value)
+      existing_dependency = None
+      if not no_dep and isinstance(value, base.CheckpointableBase):
+        # Non-string keys are OK as long as we have no reason to add a
+        # dependency on the value (either because the value is not
+        # checkpointable, or because it was wrapped in a NoDependency object).
+        self._non_string_key = True
+    current_value = self._storage.setdefault(key, value)
+    if current_value is not value:
+      if ((not no_dep and isinstance(value, base.CheckpointableBase))
+          # We don't want to just check that the existing object is
+          # checkpointable, since it may have been wrapped in a NoDependency
+          # object.
+          or existing_dependency is not None):
+        # A checkpointable object was replaced under the same key; this means
+        # that restoring would be error-prone, so we'll throw an exception on
+        # save.
+        self._non_append_mutation = True
+      self._storage[key] = value
+
+    self._update_snapshot()
+
+  def __delitem__(self, key):
+    self._check_external_modification()
+    existing_value = self[key]
+    if isinstance(existing_value, base.CheckpointableBase):
+      # Deleting tracked checkpointable values means restoring is problematic,
+      # so we'll throw an exception on save.
+      self._non_append_mutation = True
+    del self._storage[key]
+    self._update_snapshot()
+
+  def __repr__(self):
+    return "DictWrapper(%s)" % (repr(self._storage),)
+
+  def __hash__(self):
+    raise TypeError("unhashable type: 'DictWrapper'")
+
+  def __eq__(self, other):
+    return self._storage == getattr(other, "_storage", other)
+
+  def update(self, *args, **kwargs):
+    for key, value in dict(*args, **kwargs).items():
+      self[key] = value
diff --git a/tensorflow/python/training/checkpointable/data_structures_test.py b/tensorflow/python/training/checkpointable/data_structures_test.py
index 7bee00a9278acfe967f8ce09cfbd4b36e0117828..472b7c32b48ac02d6719a59a6020e97ff9c46cc2 100644
--- a/tensorflow/python/training/checkpointable/data_structures_test.py
+++ b/tensorflow/python/training/checkpointable/data_structures_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training.checkpointable import data_structures
 from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.checkpointable import util
 
 
 class HasList(training.Model):
@@ -303,6 +304,124 @@ class MappingTests(test.TestCase):
                         data_structures.Mapping()])
     self.assertEqual(2, len(has_mappings))
     self.assertNotIn(data_structures.Mapping(), has_mappings)
+    # In contrast to Mapping, dict wrappers are not hashable
+    a = tracking.Checkpointable()
+    a.d = {}
+    self.assertEqual({}, a.d)
+    self.assertFalse({} != a.d)  # pylint: disable=g-explicit-bool-comparison
+    self.assertNotEqual({1: 2}, a.d)
+    with self.assertRaisesRegexp(TypeError, "unhashable"):
+      set([a.d])
+
+  def testDictWrapperBadKeys(self):
+    a = tracking.Checkpointable()
+    a.d = {}
+    a.d[1] = data_structures.List()
+    model = training.Model()
+    model.sub = a
+    save_path = os.path.join(self.get_temp_dir(), "ckpt")
+    with self.assertRaisesRegexp(ValueError, "non-string key"):
+      model.save_weights(save_path)
+
+  def testDictWrapperNoDependency(self):
+    a = tracking.Checkpointable()
+    a.d = data_structures.NoDependency({})
+    a.d[1] = [3]
+    self.assertEqual([a], util.list_objects(a))
+    model = training.Model()
+    model.sub = a
+    save_path = os.path.join(self.get_temp_dir(), "ckpt")
+    model.save_weights(save_path)
+    model.load_weights(save_path)
+
+  def testNonStringKeyNotCheckpointableValue(self):
+    a = tracking.Checkpointable()
+    a.d = {}
+    a.d["a"] = [3]
+    a.d[1] = data_structures.NoDependency([3])
+    self.assertEqual([a, a.d, a.d["a"]], util.list_objects(a))
+    model = training.Model()
+    model.sub = a
+    save_path = os.path.join(self.get_temp_dir(), "ckpt")
+    model.save_weights(save_path)
+    model.load_weights(save_path)
+
+  def testNonAppendNotCheckpointable(self):
+    # Non-append mutations (deleting or overwriting values) are OK when the
+    # values aren't tracked.
+    a = tracking.Checkpointable()
+    a.d = {}
+    a.d["a"] = [3]
+    a.d[1] = 3
+    a.d[1] = 2
+    self.assertEqual(2, a.d[1])
+    del a.d[1]
+    a.d[2] = data_structures.NoDependency(tracking.Checkpointable())
+    second = tracking.Checkpointable()
+    a.d[2] = data_structures.NoDependency(second)
+    self.assertIs(second, a.d[2])
+    self.assertEqual([a, a.d, a.d["a"]], util.list_objects(a))
+    model = training.Model()
+    model.sub = a
+    save_path = os.path.join(self.get_temp_dir(), "ckpt")
+    model.save_weights(save_path)
+    model.load_weights(save_path)
+
+  def testDelNoSave(self):
+    model = training.Model()
+    model.d = {}
+    model.d["a"] = []
+    del model.d["a"]
+    save_path = os.path.join(self.get_temp_dir(), "ckpt")
+    with self.assertRaisesRegexp(ValueError, "overwritten or deleted"):
+      model.save_weights(save_path)
+
+  def testPopNoSave(self):
+    model = training.Model()
+    model.d = {}
+    model.d["a"] = []
+    model.d.pop("a")
+    save_path = os.path.join(self.get_temp_dir(), "ckpt")
+    with self.assertRaisesRegexp(ValueError, "overwritten or deleted"):
+      model.save_weights(save_path)
+
+  def testExternalModificationNoSave(self):
+    model = training.Model()
+    external_reference = {}
+    model.d = external_reference
+    external_reference["a"] = []
+    save_path = os.path.join(self.get_temp_dir(), "ckpt")
+    with self.assertRaisesRegexp(ValueError, "modified outside the wrapper"):
+      model.save_weights(save_path)
+
+  def testOverwriteNoSave(self):
+    model = training.Model()
+    model.d = {}
+    model.d["a"] = {}
+    model.d["a"] = {}
+    save_path = os.path.join(self.get_temp_dir(), "ckpt")
+    with self.assertRaisesRegexp(ValueError, "overwritten or deleted"):
+      model.save_weights(save_path)
+
+  def testIter(self):
+    model = training.Model()
+    model.d = {1: 3}
+    model.d[1] = 3
+    self.assertEqual([1], list(model.d))
+    new_dict = {}
+    # This update() is super tricky. If the dict wrapper subclasses dict,
+    # CPython will access its storage directly instead of calling any
+    # methods/properties on the object. So the options are either not to
+    # subclass dict (in which case update will call normal iter methods, but the
+    # object won't pass isinstance checks) or to subclass dict and keep that
+    # storage updated (no shadowing all its methods like _ListWrapper).
+    new_dict.update(model.d)
+    self.assertEqual({1: 3}, new_dict)
+
+  def testConstructableFromSequence(self):
+    result = data_structures._DictWrapper([(1, 2), (3, 4)])
+    self.assertIsInstance(result, dict)
+    self.assertEqual({1: 2, 3: 4}, result)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/training/checkpointable/tracking_test.py b/tensorflow/python/training/checkpointable/tracking_test.py
index 96da0d6e4720b44815de137c0efdd74645bae0fc..f8d17cd417e4e81fd1e37d21a0a7de1d8ef8d3c4 100644
--- a/tensorflow/python/training/checkpointable/tracking_test.py
+++ b/tensorflow/python/training/checkpointable/tracking_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 import os
 
 import numpy
+import six
 
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training
@@ -143,6 +144,29 @@ class InterfaceTests(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "A list element was replaced"):
       checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt"))
 
+  @test_util.run_in_graph_and_eager_modes
+  def testDictionariesBasic(self):
+    a = training.Model()
+    b = training.Model()
+    a.attribute = {"b": b}
+    c = training.Model()
+    a.attribute["c"] = []
+    a.attribute["c"].append(c)
+    a_deps = util.list_objects(a)
+    self.assertIn(b, a_deps)
+    self.assertIn(c, a_deps)
+    self.assertIs(b, a.attribute["b"])
+    six.assertCountEqual(
+        self,
+        ["b", "c"],
+        [dep.name for dep in a.attribute._checkpoint_dependencies])
+    self.assertEqual([b, c], a.layers)
+    self.assertEqual([b, c], a.attribute.layers)
+    self.assertEqual([c], a.attribute["c"].layers)
+    checkpoint = util.Checkpoint(a=a)
+    save_path = checkpoint.save(os.path.join(self.get_temp_dir(), "ckpt"))
+    checkpoint.restore(save_path).assert_consumed()
+
   @test_util.run_in_graph_and_eager_modes
   def testNoDepList(self):
     a = training.Model()
@@ -159,12 +183,13 @@ class InterfaceTests(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def testAssertions(self):
     a = tracking.Checkpointable()
-    a.l = [numpy.zeros([2, 2])]
-    self.assertAllEqual([numpy.zeros([2, 2])], a.l)
-    self.assertAllClose([numpy.zeros([2, 2])], a.l)
-    nest.map_structure(self.assertAllClose, a.l, [numpy.zeros([2, 2])])
-    a.tensors = [array_ops.ones([2, 2]), array_ops.zeros([3, 3])]
-    self.assertAllClose([numpy.ones([2, 2]), numpy.zeros([3, 3])],
+    a.l = {"k": [numpy.zeros([2, 2])]}
+    self.assertAllEqual(nest.flatten({"k": [numpy.zeros([2, 2])]}),
+                        nest.flatten(a.l))
+    self.assertAllClose({"k": [numpy.zeros([2, 2])]}, a.l)
+    nest.map_structure(self.assertAllClose, a.l, {"k": [numpy.zeros([2, 2])]})
+    a.tensors = {"k": [array_ops.ones([2, 2]), array_ops.zeros([3, 3])]}
+    self.assertAllClose({"k": [numpy.ones([2, 2]), numpy.zeros([3, 3])]},
                         self.evaluate(a.tensors))
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py
index 6ae5765b133cc72b67f3d9864d0f67abf33f0648..664b2348c0e44303ea8e297c462383da3e8cf3db 100644
--- a/tensorflow/python/training/checkpointable/util.py
+++ b/tensorflow/python/training/checkpointable/util.py
@@ -101,6 +101,7 @@ class _CheckpointRestoreCoordinator(object):
     # this checkpoint.
     self.restore_ops = []
     self.restore_ops_by_name = {}
+    self.new_restore_ops_callback = None
     # A mapping from optimizer proto ids to lists of slot variables to be
     # restored when the optimizer is tracked. Only includes slot variables whose
     # regular variables have already been created, and only for optimizer
@@ -121,6 +122,11 @@ class _CheckpointRestoreCoordinator(object):
                     slot_variable_id=slot_reference.slot_variable_node_id,
                     slot_name=slot_reference.slot_name))
 
+  def new_restore_ops(self, new_ops):
+    self.restore_ops.extend(new_ops)
+    if self.new_restore_ops_callback:
+      self.new_restore_ops_callback(new_ops)  # pylint: disable=not-callable
+
 
 class _NameBasedRestoreCoordinator(object):
   """Keeps the status of a name-based checkpoint restore."""
@@ -361,24 +367,42 @@ class _ObjectIdentityWeakKeyDictionary(_ObjectIdentityDictionary):
         yield unwrapped
 
 
-class _ObjectIdentityWeakSet(collections.MutableSet):
-  """Like weakref.WeakSet, but compares objects with "is"."""
+class _ObjectIdentitySet(collections.MutableSet):
+  """Like the built-in set, but compares objects with "is"."""
 
-  def __init__(self):
-    self._storage = set()
+  def __init__(self, *args):
+    self._storage = set([self._wrap_key(obj) for obj in list(*args)])
+
+  def _wrap_key(self, key):
+    return _ObjectIdentityWrapper(key)
 
   def __contains__(self, key):
-    return _WeakObjectIdentityWrapper(key) in self._storage
+    return self._wrap_key(key) in self._storage
 
   def discard(self, key):
-    self._storage.discard(_WeakObjectIdentityWrapper(key))
+    self._storage.discard(self._wrap_key(key))
 
   def add(self, key):
-    self._storage.add(_WeakObjectIdentityWrapper(key))
+    self._storage.add(self._wrap_key(key))
+
+  def __len__(self):
+    return len(self._storage)
+
+  def __iter__(self):
+    keys = list(self._storage)
+    for key in keys:
+      yield key.unwrapped
+
+
+class _ObjectIdentityWeakSet(_ObjectIdentitySet):
+  """Like weakref.WeakSet, but compares objects with "is"."""
+
+  def _wrap_key(self, key):
+    return _WeakObjectIdentityWrapper(key)
 
   def __len__(self):
     # Iterate, discarding old weak refs
-    return len(list(self))
+    return len([_ for _ in self])
 
   def __iter__(self):
     keys = list(self._storage)
@@ -747,7 +771,7 @@ def capture_dependencies(template):
           initial_value=initializer,
           name=name,
           **inner_kwargs)
-    if name.startswith(name_prefix):
+    if name is not None and name.startswith(name_prefix):
       scope_stripped_name = name[len(name_prefix) + 1:]
       if not checkpointable_parent:
         return template._add_variable_with_custom_getter(  # pylint: disable=protected-access
@@ -803,6 +827,31 @@ class _LoadStatus(object):
     pass
 
 
+def streaming_restore(status, session=None):
+  """When graph building, runs restore ops as soon as they come in.
+
+  Args:
+    status: A _LoadStatus objects from an object-based saver's
+      restore(). Streaming restore from name-based checkpoints is not currently
+      supported.
+    session: A session to run new restore ops in.
+  """
+  if context.executing_eagerly():
+    # Streaming restore is the default/only behavior when executing eagerly.
+    return
+  if session is None:
+    session = ops.get_default_session()
+  if isinstance(status, NameBasedSaverStatus):
+    raise NotImplementedError(
+        "Streaming restore not supported from name-based checkpoints. File a "
+        "feature request if this limitation bothers you.")
+  status.run_restore_ops(session=session)
+  # pylint: disable=protected-access
+  status._checkpoint.new_restore_ops_callback = (
+      lambda ops: session.run(ops, feed_dict=status._feed_dict))
+  # pylint: enable=protected-access
+
+
 class CheckpointLoadStatus(_LoadStatus):
   """Checks the status of checkpoint loading and manages restore ops.
 
@@ -857,8 +906,8 @@ class CheckpointLoadStatus(_LoadStatus):
     for checkpointable_object in list_objects(self._root_checkpointable):
       self._checkpoint.all_python_objects.add(checkpointable_object)
     unused_python_objects = (
-        set(self._checkpoint.all_python_objects)
-        - set(self._checkpoint.object_by_proto_id.values()))
+        _ObjectIdentitySet(self._checkpoint.all_python_objects)
+        - _ObjectIdentitySet(self._checkpoint.object_by_proto_id.values()))
     if unused_python_objects:
       raise AssertionError(
           ("Some Python objects were not bound to checkpointed values, likely "
@@ -974,11 +1023,13 @@ _DEPRECATED_RESTORE_INSTRUCTIONS = (
     "one this message is coming from) and use that checkpoint in the future.")
 
 
-@deprecation.deprecated(
-    date=None, instructions=_DEPRECATED_RESTORE_INSTRUCTIONS)
 class NameBasedSaverStatus(_LoadStatus):
   """Status for loading a name-based training checkpoint."""
 
+  # Ideally this deprecation decorator would be on the class, but that
+  # interferes with isinstance checks.
+  @deprecation.deprecated(
+      date=None, instructions=_DEPRECATED_RESTORE_INSTRUCTIONS)
   def __init__(self, checkpoint, root_checkpointable):
     self._checkpoint = checkpoint
     self._root_checkpointable = root_checkpointable
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index 51190264e81ad177c56a6864b616aee52d954c43..fd195a7965ab7512728e4e9e9e0c51a00b6ad79d 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -356,7 +356,15 @@ def natural_exp_decay(learning_rate,
   The function returns the decayed learning rate.  It is computed as:
 
   ```python
-  decayed_learning_rate = learning_rate * exp(-decay_rate * global_step)
+  decayed_learning_rate = learning_rate * exp(-decay_rate * global_step /
+  decay_step)
+  ```
+
+  or, if `staircase` is `True`, as:
+
+  ```python
+  decayed_learning_rate = learning_rate * exp(-decay_rate * floor(global_step /
+  decay_step))
   ```
 
   Example: decay exponentially with a base of 0.96:
@@ -365,8 +373,10 @@ def natural_exp_decay(learning_rate,
   ...
   global_step = tf.Variable(0, trainable=False)
   learning_rate = 0.1
+  decay_steps = 5
   k = 0.5
-  learning_rate = tf.train.exponential_time_decay(learning_rate, global_step, k)
+  learning_rate = tf.train.natural_exp_decay(learning_rate, global_step,
+                                             decay_steps, k)
 
   # Passing global_step to minimize() will increment it at each step.
   learning_step = (
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 1ee975fbe48e8ba724d8f40040b122c5c02aa352..c80cdf03be43f2af8b0247109dc52af3e95c8318 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -126,8 +126,10 @@ class BaseSaverBuilder(object):
           def f():
             with ops.device(v.device):
               x = v.read_value()
-            with ops.device("/device:CPU:0"):
-              return array_ops.identity(x)
+              # To allow variables placed on non-CPU devices to be checkpointed,
+              # we copy them to CPU on the same machine first.
+              with ops.device("/device:CPU:0"):
+                return array_ops.identity(x)
           return f
 
         self.handle_op = var.handle
@@ -1923,6 +1925,14 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False,
   execution is enabled.
   @end_compatibility
   """  # pylint: disable=g-doc-exception
+  return _import_meta_graph_with_return_elements(
+      meta_graph_or_file, clear_devices, import_scope, **kwargs)[0]
+
+
+def _import_meta_graph_with_return_elements(
+    meta_graph_or_file, clear_devices=False, import_scope=None,
+    return_elements=None, **kwargs):
+  """Import MetaGraph, and return both a saver and returned elements."""
   if context.executing_eagerly():
     raise RuntimeError("Exporting/importing meta graphs is not supported when "
                        "eager execution is enabled. No graph exists when eager "
@@ -1932,12 +1942,22 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False,
   else:
     meta_graph_def = meta_graph_or_file
 
-  imported_vars = meta_graph.import_scoped_meta_graph(
-      meta_graph_def,
-      clear_devices=clear_devices,
-      import_scope=import_scope,
-      **kwargs)
+  imported_vars, imported_return_elements = (
+      meta_graph.import_scoped_meta_graph_with_return_elements(
+          meta_graph_def,
+          clear_devices=clear_devices,
+          import_scope=import_scope,
+          return_elements=return_elements,
+          **kwargs))
+
+  saver = _create_saver_from_imported_meta_graph(
+      meta_graph_def, import_scope, imported_vars)
+  return saver, imported_return_elements
+
 
+def _create_saver_from_imported_meta_graph(
+    meta_graph_def, import_scope, imported_vars):
+  """Return a saver for restoring variable values to an imported MetaGraph."""
   if meta_graph_def.HasField("saver_def"):
     # Infer the scope that is prepended by `import_scoped_meta_graph`.
     scope = import_scope
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index 8943173cda4978f7e6bc6f1fec2c2253476fb788..e40e5c39cc39af19bf969d986eefa140fc32eb42 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -174,6 +174,24 @@ class SaverTest(test.TestCase):
   def testResourceBasic(self):
     self.basicSaveRestore(resource_variable_ops.ResourceVariable)
 
+  def testResourceColocation(self):
+    partitioner = partitioned_variables.fixed_size_partitioner(num_shards=2)
+    with ops_lib.device("/job:ps/device:GPU:0"):
+      v = variable_scope.get_variable("v0",
+                                      shape=[10, 2],
+                                      partitioner=partitioner,
+                                      use_resource=True)
+    saver_module.Saver({"v0": v}).build()
+    save_op = None
+    for op in ops_lib.get_default_graph().get_operations():
+      if op.type == "SaveV2":
+        save_op = op
+        break
+    assert save_op is not None
+    for save_inp in save_op.inputs[3:]:
+      # Input to SaveV2 op is placed on CPU of the same device as the Variable.
+      self.assertEqual("/job:ps/device:CPU:0", save_inp.device)
+
   def testResourceVariableReadOpsAddedDeterministically(self):
     graph_defs = []
     num_graphs = 10
@@ -793,6 +811,37 @@ class SaverTest(test.TestCase):
                                      "original dtype"):
           save.restore(sess, save_path)
 
+  # Test restoring large tensors (triggers a thread pool)
+  def testRestoreLargeTensors(self):
+    save_dir = self.get_temp_dir()
+    def _model():
+      small_v = [variable_scope.get_variable(
+          "small%d" % i, shape=[10, 2], use_resource=True) for i in range(5)]
+      large_v = [variable_scope.get_variable(
+          "large%d" % i, shape=[32000, 1000], use_resource=True)
+                 for i in range(3)]
+      return small_v + large_v
+
+    save_graph = ops_lib.Graph()
+    with save_graph.as_default(), self.test_session(graph=save_graph) as sess:
+      orig_vars = _model()
+      sess.run(variables.global_variables_initializer())
+      save = saver_module.Saver(max_to_keep=1)
+      variables.global_variables_initializer().run()
+      save.save(sess, save_dir)
+      orig_vals = sess.run(orig_vars)
+
+    restore_graph = ops_lib.Graph()
+    with restore_graph.as_default(), self.test_session(
+        graph=restore_graph) as sess:
+      restored_vars = _model()
+      save = saver_module.Saver(max_to_keep=1)
+      save.restore(sess, save_dir)
+      restored_vals = sess.run(restored_vars)
+
+    for orig, restored in zip(orig_vals, restored_vals):
+      self.assertAllEqual(orig, restored)
+
 
 class SaveRestoreShardedTest(test.TestCase):
 
diff --git a/tensorflow/python/util/function_utils.py b/tensorflow/python/util/function_utils.py
index 7bbbde3cd288a7373c1ac845977a4d92d2a1b7c0..4e9b07e20ac7ef176316d3532958c84754628e56 100644
--- a/tensorflow/python/util/function_utils.py
+++ b/tensorflow/python/util/function_utils.py
@@ -20,6 +20,8 @@ from __future__ import print_function
 
 import functools
 
+import six
+
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 
@@ -55,3 +57,36 @@ def fn_args(fn):
     if _is_bounded_method(fn):
       args.remove('self')
   return tuple(args)
+
+
+def get_func_name(func):
+  """Returns name of passed callable."""
+  _, func = tf_decorator.unwrap(func)
+  if callable(func):
+    if tf_inspect.isfunction(func):
+      return func.__name__
+    elif tf_inspect.ismethod(func):
+      return '%s.%s' % (six.get_method_self(func).__class__.__name__,
+                        six.get_method_function(func).__name__)
+    else:  # Probably a class instance with __call__
+      return str(type(func))
+  else:
+    raise ValueError('Argument must be callable')
+
+
+def get_func_code(func):
+  """Returns func_code of passed callable, or None if not available."""
+  _, func = tf_decorator.unwrap(func)
+  if callable(func):
+    if tf_inspect.isfunction(func) or tf_inspect.ismethod(func):
+      return six.get_function_code(func)
+    # Since the object is not a function or method, but is a callable, we will
+    # try to access the __call__method as a function.  This works with callable
+    # classes but fails with functool.partial objects despite their __call__
+    # attribute.
+    try:
+      return six.get_function_code(func.__call__)
+    except AttributeError:
+      return None
+  else:
+    raise ValueError('Argument must be callable')
diff --git a/tensorflow/python/util/function_utils_test.py b/tensorflow/python/util/function_utils_test.py
index e78cf6a5b02af317b08ff3a833f7b73b062f106e..1588328c262982e5b71446e499c8d0217c28c0a5 100644
--- a/tensorflow/python/util/function_utils_test.py
+++ b/tensorflow/python/util/function_utils_test.py
@@ -24,6 +24,16 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import function_utils
 
 
+def silly_example_function():
+  pass
+
+
+class SillyCallableClass(object):
+
+  def __call__(self):
+    pass
+
+
 class FnArgsTest(test.TestCase):
 
   def test_simple_function(self):
@@ -124,5 +134,73 @@ class FnArgsTest(test.TestCase):
     self.assertEqual(3, double_wrapped_fn(3))
     self.assertEqual(3, double_wrapped_fn(a=3))
 
+
+class GetFuncNameTest(test.TestCase):
+
+  def testWithSimpleFunction(self):
+    self.assertEqual(
+        'silly_example_function',
+        function_utils.get_func_name(silly_example_function))
+
+  def testWithClassMethod(self):
+    self.assertEqual(
+        'GetFuncNameTest.testWithClassMethod',
+        function_utils.get_func_name(self.testWithClassMethod))
+
+  def testWithCallableClass(self):
+    callable_instance = SillyCallableClass()
+    self.assertRegexpMatches(
+        function_utils.get_func_name(callable_instance),
+        '<.*SillyCallableClass.*>')
+
+  def testWithFunctoolsPartial(self):
+    partial = functools.partial(silly_example_function)
+    self.assertRegexpMatches(
+        function_utils.get_func_name(partial),
+        '<.*functools.partial.*>')
+
+  def testWithLambda(self):
+    anon_fn = lambda x: x
+    self.assertEqual('<lambda>', function_utils.get_func_name(anon_fn))
+
+  def testRaisesWithNonCallableObject(self):
+    with self.assertRaises(ValueError):
+      function_utils.get_func_name(None)
+
+
+class GetFuncCodeTest(test.TestCase):
+
+  def testWithSimpleFunction(self):
+    code = function_utils.get_func_code(silly_example_function)
+    self.assertIsNotNone(code)
+    self.assertRegexpMatches(code.co_filename, 'function_utils_test.py')
+
+  def testWithClassMethod(self):
+    code = function_utils.get_func_code(self.testWithClassMethod)
+    self.assertIsNotNone(code)
+    self.assertRegexpMatches(code.co_filename, 'function_utils_test.py')
+
+  def testWithCallableClass(self):
+    callable_instance = SillyCallableClass()
+    code = function_utils.get_func_code(callable_instance)
+    self.assertIsNotNone(code)
+    self.assertRegexpMatches(code.co_filename, 'function_utils_test.py')
+
+  def testWithLambda(self):
+    anon_fn = lambda x: x
+    code = function_utils.get_func_code(anon_fn)
+    self.assertIsNotNone(code)
+    self.assertRegexpMatches(code.co_filename, 'function_utils_test.py')
+
+  def testWithFunctoolsPartial(self):
+    partial = functools.partial(silly_example_function)
+    code = function_utils.get_func_code(partial)
+    self.assertIsNone(code)
+
+  def testRaisesWithNonCallableObject(self):
+    with self.assertRaises(ValueError):
+      function_utils.get_func_code(None)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py
index d63f59a8c8e836d3f8ad3686da0b0b3f010a9225..5aac559b9b017aa236dcb6c20538d90daef903be 100644
--- a/tensorflow/python/util/nest.py
+++ b/tensorflow/python/util/nest.py
@@ -73,7 +73,7 @@ def _sequence_like(instance, args):
   Returns:
     `args` with the type of `instance`.
   """
-  if isinstance(instance, dict):
+  if isinstance(instance, (dict, _collections.Mapping)):
     # Pack dictionaries in a deterministic order by sorting the keys.
     # Notice this means that we ignore the original order of `OrderedDict`
     # instances. This is intentional, to avoid potential bugs caused by mixing
@@ -89,7 +89,7 @@ def _sequence_like(instance, args):
 
 
 def _yield_value(iterable):
-  if isinstance(iterable, dict):
+  if isinstance(iterable, (dict, _collections.Mapping)):
     # Iterate through dictionaries in a deterministic order by sorting the
     # keys. Notice this means that we ignore the original order of `OrderedDict`
     # instances. This is intentional, to avoid potential bugs caused by mixing
@@ -215,7 +215,7 @@ def flatten_dict_items(dictionary):
     ValueError: If any key and value have not the same structure, or if keys are
       not unique.
   """
-  if not isinstance(dictionary, dict):
+  if not isinstance(dictionary, (dict, _collections.Mapping)):
     raise TypeError("input must be a dictionary")
   flat_dictionary = {}
   for i, v in _six.iteritems(dictionary):
@@ -455,7 +455,7 @@ def assert_shallow_structure(shallow_tree, input_tree, check_types=True):
           "structure has length %s, while shallow structure has length %s."
           % (len(input_tree), len(shallow_tree)))
 
-    if check_types and isinstance(shallow_tree, dict):
+    if check_types and isinstance(shallow_tree, (dict, _collections.Mapping)):
       if set(input_tree) != set(shallow_tree):
         raise ValueError(
             "The two structures don't have the same keys. Input "
@@ -716,7 +716,7 @@ def yield_flat_paths(nest):
 
   # The _maybe_add_final_path_element function is used below in order to avoid
   # adding trailing slashes when the sub-element recursed into is a leaf.
-  if isinstance(nest, dict):
+  if isinstance(nest, (dict, _collections.Mapping)):
     for key in _sorted(nest):
       value = nest[key]
       for sub_path in yield_flat_paths(value):
@@ -760,3 +760,4 @@ def flatten_with_joined_string_paths(structure, separator="/"):
 
 
 _pywrap_tensorflow.RegisterSequenceClass(_collections.Sequence)
+_pywrap_tensorflow.RegisterMappingClass(_collections.Mapping)
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index 2f12b25354a905b2aafa870c28f1e9c0b693e888..26c6ea4b012e1d0577b63144145625d9b03bc54b 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import collections
 import time
 
+from absl.testing import parameterized
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
@@ -33,7 +34,22 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import nest
 
 
-class NestTest(test.TestCase):
+class _CustomMapping(collections.Mapping):
+
+  def __init__(self, *args, **kwargs):
+    self._wrapped = dict(*args, **kwargs)
+
+  def __getitem__(self, key):
+    return self._wrapped[key]
+
+  def __iter__(self):
+    return iter(self._wrapped)
+
+  def __len__(self):
+    return len(self._wrapped)
+
+
+class NestTest(parameterized.TestCase, test.TestCase):
 
   PointXY = collections.namedtuple("Point", ["x", "y"])  # pylint: disable=invalid-name
 
@@ -72,26 +88,32 @@ class NestTest(test.TestCase):
     with self.assertRaises(ValueError):
       nest.pack_sequence_as([5, 6, [7, 8]], ["a", "b", "c"])
 
+  @parameterized.parameters({"mapping_type": collections.OrderedDict},
+                            {"mapping_type": _CustomMapping})
   @test_util.assert_no_new_pyobjects_executing_eagerly
-  def testFlattenDictOrder(self):
+  def testFlattenDictOrder(self, mapping_type):
     """`flatten` orders dicts by key, including OrderedDicts."""
-    ordered = collections.OrderedDict([("d", 3), ("b", 1), ("a", 0), ("c", 2)])
+    ordered = mapping_type([("d", 3), ("b", 1), ("a", 0), ("c", 2)])
     plain = {"d": 3, "b": 1, "a": 0, "c": 2}
     ordered_flat = nest.flatten(ordered)
     plain_flat = nest.flatten(plain)
     self.assertEqual([0, 1, 2, 3], ordered_flat)
     self.assertEqual([0, 1, 2, 3], plain_flat)
 
-  def testPackDictOrder(self):
+  @parameterized.parameters({"mapping_type": collections.OrderedDict},
+                            {"mapping_type": _CustomMapping})
+  def testPackDictOrder(self, mapping_type):
     """Packing orders dicts by key, including OrderedDicts."""
-    ordered = collections.OrderedDict([("d", 0), ("b", 0), ("a", 0), ("c", 0)])
+    custom = mapping_type([("d", 0), ("b", 0), ("a", 0), ("c", 0)])
     plain = {"d": 0, "b": 0, "a": 0, "c": 0}
     seq = [0, 1, 2, 3]
-    ordered_reconstruction = nest.pack_sequence_as(ordered, seq)
+    custom_reconstruction = nest.pack_sequence_as(custom, seq)
     plain_reconstruction = nest.pack_sequence_as(plain, seq)
+    self.assertIsInstance(custom_reconstruction, mapping_type)
+    self.assertIsInstance(plain_reconstruction, dict)
     self.assertEqual(
-        collections.OrderedDict([("d", 3), ("b", 1), ("a", 0), ("c", 2)]),
-        ordered_reconstruction)
+        mapping_type([("d", 3), ("b", 1), ("a", 0), ("c", 2)]),
+        custom_reconstruction)
     self.assertEqual({"d": 3, "b": 1, "a": 0, "c": 2}, plain_reconstruction)
 
   Abc = collections.namedtuple("A", ("b", "c"))  # pylint: disable=invalid-name
@@ -101,8 +123,10 @@ class NestTest(test.TestCase):
     # A nice messy mix of tuples, lists, dicts, and `OrderedDict`s.
     mess = [
         "z",
-        NestTest.Abc(3, 4),
-        {
+        NestTest.Abc(3, 4), {
+            "d": _CustomMapping({
+                41: 4
+            }),
             "c": [
                 1,
                 collections.OrderedDict([
@@ -111,17 +135,19 @@ class NestTest(test.TestCase):
                 ]),
             ],
             "b": 5
-        },
-        17
+        }, 17
     ]
 
     flattened = nest.flatten(mess)
-    self.assertEqual(flattened, ["z", 3, 4, 5, 1, 2, 3, 17])
+    self.assertEqual(flattened, ["z", 3, 4, 5, 1, 2, 3, 4, 17])
 
     structure_of_mess = [
         14,
         NestTest.Abc("a", True),
         {
+            "d": _CustomMapping({
+                41: 42
+            }),
             "c": [
                 0,
                 collections.OrderedDict([
@@ -142,6 +168,10 @@ class NestTest(test.TestCase):
     self.assertIsInstance(unflattened_ordered_dict, collections.OrderedDict)
     self.assertEqual(list(unflattened_ordered_dict.keys()), ["b", "a"])
 
+    unflattened_custom_mapping = unflattened[2]["d"]
+    self.assertIsInstance(unflattened_custom_mapping, _CustomMapping)
+    self.assertEqual(list(unflattened_custom_mapping.keys()), [41])
+
   def testFlatten_numpyIsNotFlattened(self):
     structure = np.array([1, 2, 3])
     flattened = nest.flatten(structure)
@@ -179,19 +209,23 @@ class NestTest(test.TestCase):
     self.assertFalse(nest.is_sequence(math_ops.tanh(ones)))
     self.assertFalse(nest.is_sequence(np.ones((4, 5))))
 
-  def testFlattenDictItems(self):
-    dictionary = {(4, 5, (6, 8)): ("a", "b", ("c", "d"))}
+  @parameterized.parameters({"mapping_type": _CustomMapping},
+                            {"mapping_type": dict})
+  def testFlattenDictItems(self, mapping_type):
+    dictionary = mapping_type({(4, 5, (6, 8)): ("a", "b", ("c", "d"))})
     flat = {4: "a", 5: "b", 6: "c", 8: "d"}
     self.assertEqual(nest.flatten_dict_items(dictionary), flat)
 
     with self.assertRaises(TypeError):
       nest.flatten_dict_items(4)
 
-    bad_dictionary = {(4, 5, (4, 8)): ("a", "b", ("c", "d"))}
+    bad_dictionary = mapping_type({(4, 5, (4, 8)): ("a", "b", ("c", "d"))})
     with self.assertRaisesRegexp(ValueError, "not unique"):
       nest.flatten_dict_items(bad_dictionary)
 
-    another_bad_dictionary = {(4, 5, (6, 8)): ("a", "b", ("c", ("d", "e")))}
+    another_bad_dictionary = mapping_type({
+        (4, 5, (6, 8)): ("a", "b", ("c", ("d", "e")))
+    })
     with self.assertRaisesRegexp(
         ValueError, "Key had [0-9]* elements, but value had [0-9]* elements"):
       nest.flatten_dict_items(another_bad_dictionary)
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index 366f8a0deb533c3ee258ea618136d44a28160f8f..ad85a44f8d3c634f943f1ca0f0a96c2c3202e704 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -31,6 +31,8 @@ namespace {
 
 // Type object for collections.Sequence. This is set by RegisterSequenceClass.
 PyObject* CollectionsSequenceType = nullptr;
+// Type object for collections.Mapping, set by RegisterMappingClass.
+PyObject* CollectionsMappingType = nullptr;
 PyTypeObject* SparseTensorValueType = nullptr;
 
 const int kMaxItemsInCache = 1024;
@@ -45,6 +47,23 @@ bool IsString(PyObject* o) {
          PyUnicode_Check(o);
 }
 
+// Work around a writable-strings warning with Python 2's PyMapping_Keys macro,
+// and while we're at it give them consistent behavior by making sure the
+// returned value is a list.
+//
+// As with PyMapping_Keys, returns a new reference.
+PyObject* MappingKeys(PyObject* o) {
+#if PY_MAJOR_VERSION >= 3
+  return PyMapping_Keys(o);
+#else
+  static char key_method_name[] = "keys";
+  Safe_PyObjectPtr raw_result(PyObject_CallMethod(o, key_method_name, nullptr));
+  return PySequence_Fast(
+      raw_result.get(),
+      "The '.keys()' method of a custom mapping returned a non-sequence.");
+#endif
+}
+
 // Equivalent to Python's 'o.__class__.__name__'
 // Note that '__class__' attribute is set only in new-style classes.
 // A lot of tensorflow code uses __class__ without checks, so it seems like
@@ -85,6 +104,119 @@ string PyObjectToString(PyObject* o) {
   }
 }
 
+class CachedTypeCheck {
+ public:
+  explicit CachedTypeCheck(std::function<int(PyObject*)> ternary_predicate)
+      : ternary_predicate_(std::move(ternary_predicate)) {}
+
+  ~CachedTypeCheck() {
+    mutex_lock l(type_to_sequence_map_mu_);
+    for (const auto& pair : type_to_sequence_map_) {
+      Py_DECREF(pair.first);
+    }
+  }
+
+  // Caches successful executions of the one-argument (PyObject*) callable
+  // "ternary_predicate" based on the type of "o". -1 from the callable
+  // indicates an unsuccessful check (not cached), 0 indicates that "o"'s type
+  // does not match the predicate, and 1 indicates that it does. Used to avoid
+  // calling back into Python for expensive isinstance checks.
+  int CachedLookup(PyObject* o) {
+    // Try not to return to Python - see if the type has already been seen
+    // before.
+
+    auto* type = Py_TYPE(o);
+
+    {
+      mutex_lock l(type_to_sequence_map_mu_);
+      auto it = type_to_sequence_map_.find(type);
+      if (it != type_to_sequence_map_.end()) {
+        return it->second;
+      }
+    }
+
+    int check_result = ternary_predicate_(o);
+
+    if (check_result == -1) {
+      return -1;  // Type check error, not cached.
+    }
+
+    // NOTE: This is never decref'd as long as the object lives, which is likely
+    // forever, but we don't want the type to get deleted as long as it is in
+    // the map. This should not be too much of a leak, as there should only be a
+    // relatively small number of types in the map, and an even smaller number
+    // that are eligible for decref. As a precaution, we limit the size of the
+    // map to 1024.
+    {
+      mutex_lock l(type_to_sequence_map_mu_);
+      if (type_to_sequence_map_.size() < kMaxItemsInCache) {
+        Py_INCREF(type);
+        type_to_sequence_map_.insert({type, check_result});
+      }
+    }
+
+    return check_result;
+  }
+
+ private:
+  std::function<int(PyObject*)> ternary_predicate_;
+  mutex type_to_sequence_map_mu_;
+  std::unordered_map<PyTypeObject*, bool> type_to_sequence_map_
+      GUARDED_BY(type_to_sequence_map_mu_);
+};
+
+// Returns 1 if `o` is considered a mapping for the purposes of Flatten().
+// Returns 0 otherwise.
+// Returns -1 if an error occurred.
+int IsMappingHelper(PyObject* o) {
+  static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) {
+    return PyObject_IsInstance(to_check, CollectionsMappingType);
+  });
+  if (PyDict_Check(o)) return true;
+  if (TF_PREDICT_FALSE(CollectionsMappingType == nullptr)) {
+    PyErr_SetString(
+        PyExc_RuntimeError,
+        tensorflow::strings::StrCat(
+            "collections.Mapping type has not been set. "
+            "Please call RegisterMappingClass before using this module")
+            .c_str());
+    return -1;
+  }
+  return check_cache->CachedLookup(o);
+}
+
+// Returns 1 if `o` is considered a sequence for the purposes of Flatten().
+// Returns 0 otherwise.
+// Returns -1 if an error occurred.
+int IsSequenceHelper(PyObject* o) {
+  static auto* const check_cache = new CachedTypeCheck([](PyObject* to_check) {
+    int is_instance = PyObject_IsInstance(to_check, CollectionsSequenceType);
+
+    // Don't cache a failed is_instance check.
+    if (is_instance == -1) return -1;
+
+    return static_cast<int>(is_instance != 0 && !IsString(to_check));
+  });
+  // We treat dicts and other mappings as special cases of sequences.
+  if (IsMappingHelper(o)) return true;
+  if (PySet_Check(o) && !WarnedThatSetIsNotSequence) {
+    LOG(WARNING) << "Sets are not currently considered sequences, "
+                    "but this may change in the future, "
+                    "so consider avoiding using them.";
+    WarnedThatSetIsNotSequence = true;
+  }
+  if (TF_PREDICT_FALSE(CollectionsSequenceType == nullptr)) {
+    PyErr_SetString(
+        PyExc_RuntimeError,
+        tensorflow::strings::StrCat(
+            "collections.Sequence type has not been set. "
+            "Please call RegisterSequenceClass before using this module")
+            .c_str());
+    return -1;
+  }
+  return check_cache->CachedLookup(o);
+}
+
 // Implements the same idea as tensorflow.util.nest._yield_value
 // During construction we check if the iterable is a dictionary.
 // If so, we construct a sequence from its sorted keys that will be used
@@ -96,7 +228,12 @@ string PyObjectToString(PyObject* o) {
 // 'iterable' must not be modified while ValIterator is used.
 class ValIterator {
  public:
-  explicit ValIterator(PyObject* iterable) : dict_(nullptr), index_(0) {
+  explicit ValIterator(PyObject* iterable)
+      : dict_(nullptr),
+        mapping_(nullptr),
+        last_mapping_element_(nullptr),
+        seq_(nullptr),
+        index_(0) {
     if (PyDict_Check(iterable)) {
       dict_ = iterable;
       // PyDict_Keys returns a list, which can be used with
@@ -108,6 +245,10 @@ class ValIterator {
       // bugs caused by mixing ordered and plain dicts (e.g., flattening
       // a dict but using a corresponding `OrderedDict` to pack it back).
       PyList_Sort(seq_);
+    } else if (IsMappingHelper(iterable)) {
+      mapping_ = iterable;
+      seq_ = MappingKeys(iterable);
+      PyList_Sort(seq_);
     } else {
       seq_ = PySequence_Fast(iterable, "");
     }
@@ -122,7 +263,9 @@ class ValIterator {
     PyObject* element = nullptr;
     if (index_ < size_) {
       // Both PySequence_Fast_GET_ITEM and PyDict_GetItem return borrowed
-      // references.
+      // references. For general mappings, ValIterator keeps a reference to the
+      // last retrieved element (and decrefs it before producing the next
+      // element) to abstract away the borrowed/new difference.
       element = PySequence_Fast_GET_ITEM(seq_, index_);
       ++index_;
       if (dict_ != nullptr) {
@@ -132,85 +275,32 @@ class ValIterator {
                           "Dictionary was modified during iteration over it");
           return nullptr;
         }
+      } else if (mapping_ != nullptr) {
+        element = PyObject_GetItem(mapping_, element);
+        if (element == nullptr) {
+          PyErr_SetString(PyExc_RuntimeError,
+                          "Mapping was modified during iteration over it");
+          return nullptr;
+        }
+        last_mapping_element_.reset(element);
       }
     }
     return element;
   }
 
  private:
-  PyObject* seq_;
+  // Special casing for things that pass PyDict_Check (faster, no Python calls)
   PyObject* dict_;
+
+  // General mappings which have custom Python logic
+  PyObject* mapping_;
+  Safe_PyObjectPtr last_mapping_element_;
+
+  PyObject* seq_;
   Py_ssize_t size_;
   Py_ssize_t index_;
 };
 
-mutex g_type_to_sequence_map(LINKER_INITIALIZED);
-std::unordered_map<PyTypeObject*, bool>* IsTypeSequenceMap() {
-  static auto* const m = new std::unordered_map<PyTypeObject*, bool>;
-  return m;
-}
-
-// Returns 1 if `o` is considered a sequence for the purposes of Flatten().
-// Returns 0 otherwise.
-// Returns -1 if an error occurred.
-int IsSequenceHelper(PyObject* o) {
-  if (PyDict_Check(o)) return true;
-  if (PySet_Check(o) && !WarnedThatSetIsNotSequence) {
-    LOG(WARNING) << "Sets are not currently considered sequences, "
-                    "but this may change in the future, "
-                    "so consider avoiding using them.";
-    WarnedThatSetIsNotSequence = true;
-  }
-  if (TF_PREDICT_FALSE(CollectionsSequenceType == nullptr)) {
-    PyErr_SetString(
-        PyExc_RuntimeError,
-        tensorflow::strings::StrCat(
-            "collections.Sequence type has not been set. "
-            "Please call RegisterSequenceClass before using this module")
-            .c_str());
-    return -1;
-  }
-
-  // Try not to return to Python - see if the type has already been seen
-  // before.
-
-  auto* type_to_sequence_map = IsTypeSequenceMap();
-  auto* type = Py_TYPE(o);
-
-  {
-    mutex_lock l(g_type_to_sequence_map);
-    auto it = type_to_sequence_map->find(type);
-    if (it != type_to_sequence_map->end()) {
-      return it->second;
-    }
-  }
-
-  // NOTE: We explicitly release the g_type_to_sequence_map mutex,
-  // because PyObject_IsInstance() may release the GIL, allowing another thread
-  // concurrent entry to this function.
-  int is_instance = PyObject_IsInstance(o, CollectionsSequenceType);
-
-  // Don't cache a failed is_instance check.
-  if (is_instance == -1) return -1;
-
-  bool is_sequence = static_cast<int>(is_instance != 0 && !IsString(o));
-
-  // NOTE: This is never decref'd, but we don't want the type to get deleted
-  // as long as it is in the map. This should not be too much of a
-  // leak, as there should only be a relatively small number of types in the
-  // map, and an even smaller number that are eligible for decref. As a
-  // precaution, we limit the size of the map to 1024.
-  {
-    mutex_lock l(g_type_to_sequence_map);
-    if (type_to_sequence_map->size() < kMaxItemsInCache) {
-      Py_INCREF(type);
-      type_to_sequence_map->insert({type, is_sequence});
-    }
-  }
-
-  return is_sequence;
-}
-
 bool IsSparseTensorValueType(PyObject* o) {
   if (TF_PREDICT_FALSE(SparseTensorValueType == nullptr)) {
     return false;
@@ -226,21 +316,35 @@ int IsSequenceForDataHelper(PyObject* o) {
 
 bool GetNextValuesForDict(PyObject* nested,
                           std::vector<Safe_PyObjectPtr>* next_values) {
-  std::vector<PyObject*> result;
-
-  PyObject* keys = PyDict_Keys(nested);
-  if (PyList_Sort(keys) == -1) return false;
-  Py_ssize_t size = PyList_Size(keys);
+  Safe_PyObjectPtr keys(PyDict_Keys(nested));
+  if (PyList_Sort(keys.get()) == -1) return false;
+  Py_ssize_t size = PyList_Size(keys.get());
   for (Py_ssize_t i = 0; i < size; ++i) {
     // We know that key and item will not be deleted because nested owns
     // a reference to them and callers of flatten must not modify nested
     // while the method is running.
-    PyObject* key = PyList_GET_ITEM(keys, i);
+    PyObject* key = PyList_GET_ITEM(keys.get(), i);
     PyObject* item = PyDict_GetItem(nested, key);
     Py_INCREF(item);
     next_values->emplace_back(item);
   }
-  Py_DECREF(keys);
+  return true;
+}
+
+bool GetNextValuesForMapping(PyObject* nested,
+                             std::vector<Safe_PyObjectPtr>* next_values) {
+  Safe_PyObjectPtr keys(MappingKeys(nested));
+  if (keys.get() == nullptr) {
+    return false;
+  }
+  if (PyList_Sort(keys.get()) == -1) return false;
+  Py_ssize_t size = PyList_Size(keys.get());
+  for (Py_ssize_t i = 0; i < size; ++i) {
+    PyObject* key = PyList_GET_ITEM(keys.get(), i);
+    // Unlike PyDict_GetItem, PyObject_GetItem returns a new reference.
+    PyObject* item = PyObject_GetItem(nested, key);
+    next_values->emplace_back(item);
+  }
   return true;
 }
 
@@ -265,6 +369,9 @@ bool GetNextValues(PyObject* nested,
   if (PyDict_Check(nested)) {
     // if nested is dictionary, sort it by key and recurse on each value
     return GetNextValuesForDict(nested, next_values);
+  } else if (IsMappingHelper(nested)) {
+    // same treatment as dictionaries, but for custom mapping types
+    return GetNextValuesForMapping(nested, next_values);
   }
   // iterate and recurse
   return GetNextValuesForIterable(nested, next_values);
@@ -276,6 +383,9 @@ bool GetNextValuesForData(PyObject* nested,
   if (PyDict_Check(nested)) {
     // if nested is dictionary, sort it by key and recurse on each value
     return GetNextValuesForDict(nested, next_values);
+  } else if (IsMappingHelper(nested)) {
+    // same treatment as dictionaries, but for custom mapping types
+    return GetNextValuesForMapping(nested, next_values);
   } else if (IsSparseTensorValueType(nested)) {
     // if nested is a SparseTensorValue, just return itself as a single item
     Py_INCREF(nested);
@@ -320,8 +430,8 @@ bool FlattenHelper(
 // 'dict1' and 'dict2' are assumed to be Python dictionaries.
 void SetDifferentKeysError(PyObject* dict1, PyObject* dict2, string* error_msg,
                            bool* is_type_error) {
-  PyObject* k1 = PyDict_Keys(dict1);
-  PyObject* k2 = PyDict_Keys(dict2);
+  PyObject* k1 = MappingKeys(dict1);
+  PyObject* k2 = MappingKeys(dict2);
   *is_type_error = false;
   *error_msg = tensorflow::strings::StrCat(
       "The two dictionaries don't have the same set of keys. "
@@ -396,9 +506,12 @@ bool AssertSameStructureHelper(PyObject* o1, PyObject* o2, bool check_types,
       }
     } else if (type1 != type2
                /* If both sequences are list types, don't complain. This allows
-               one to be a list subclass (e.g. _ListWrapper used for automatic
-               dependency tracking.) */
-               && !(PyList_Check(o1) && PyList_Check(o2))) {
+                  one to be a list subclass (e.g. _ListWrapper used for
+                  automatic dependency tracking.) */
+               && !(PyList_Check(o1) && PyList_Check(o2))
+               /* Two mapping types will also compare equal, making _DictWrapper
+                  and dict compare equal. */
+               && !(IsMappingHelper(o1) && IsMappingHelper(o2))) {
       *is_type_error = true;
       *error_msg = tensorflow::strings::StrCat(
           "The two namedtuples don't have the same sequence type. "
@@ -423,6 +536,24 @@ bool AssertSameStructureHelper(PyObject* o1, PyObject* o2, bool check_types,
           return true;
         }
       }
+    } else if (IsMappingHelper(o1)) {
+      // Fallback for custom mapping types. Instead of using PyDict methods
+      // which stay in C, we call iter(o1).
+      if (PyMapping_Size(o1) != PyMapping_Size(o2)) {
+        SetDifferentKeysError(o1, o2, error_msg, is_type_error);
+        return true;
+      }
+
+      Safe_PyObjectPtr iter(PyObject_GetIter(o1));
+      PyObject* key;
+      while ((key = PyIter_Next(iter.get())) != nullptr) {
+        if (!PyMapping_HasKey(o2, key)) {
+          SetDifferentKeysError(o1, o2, error_msg, is_type_error);
+          Py_DECREF(key);
+          return true;
+        }
+        Py_DECREF(key);
+      }
     }
   }
 
@@ -470,6 +601,19 @@ void RegisterSequenceClass(PyObject* sequence_class) {
   CollectionsSequenceType = sequence_class;
 }
 
+void RegisterMappingClass(PyObject* mapping_class) {
+  if (!PyType_Check(mapping_class)) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        tensorflow::strings::StrCat(
+            "Expecting a class definition for `collections.Mapping`. Got ",
+            Py_TYPE(mapping_class)->tp_name)
+            .c_str());
+    return;
+  }
+  CollectionsMappingType = mapping_class;
+}
+
 void RegisterSparseTensorValueClass(PyObject* sparse_tensor_value_class) {
   if (!PyType_Check(sparse_tensor_value_class)) {
     PyErr_SetString(
diff --git a/tensorflow/python/util/util.h b/tensorflow/python/util/util.h
index 70efc10c9abe7c57da61311bb2eb7ae362a48e3d..41dcc969f88f8bb81b70b734cbf41fe10135a9c0 100644
--- a/tensorflow/python/util/util.h
+++ b/tensorflow/python/util/util.h
@@ -118,7 +118,9 @@ PyObject* Flatten(PyObject* nested);
 // the type from the module. This approach also requires some trigger from
 // Python so that we know that Python interpreter had been initialzied.
 void RegisterSequenceClass(PyObject* sequence_class);
-// Similar to the above function, except for the
+// Like RegisterSequenceClass, but for collections.Mapping.
+void RegisterMappingClass(PyObject* mapping_class);
+// Similar to the above functions, except for the
 // sparse_tensor.SparseTensorValue class.
 void RegisterSparseTensorValueClass(PyObject* sparse_tensor_value_class);
 
diff --git a/tensorflow/python/util/util.i b/tensorflow/python/util/util.i
index 9f3b11b982bb0d52f903b09975cc7029fa8cb013..6ad148429541a6855a4f1da5a5c89d6479f53f39 100644
--- a/tensorflow/python/util/util.i
+++ b/tensorflow/python/util/util.i
@@ -31,6 +31,9 @@ limitations under the License.
 %unignore tensorflow::swig::RegisterSequenceClass;
 %noexception tensorflow::swig::RegisterSequenceClass;
 
+%unignore tensorflow::swig::RegisterMappingClass;
+%noexception tensorflow::swig::RegisterMappingClass;
+
 %unignore tensorflow::swig::RegisterSparseTensorValueClass;
 %noexception tensorflow::swig::RegisterSparseTensorValueClass;
 
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 766a0dafb5c6c4d1446a7b7fc61751c4c534f2ec..1c3940e92ce506e3fd73f0896995320588965cab 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -322,6 +322,7 @@ port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
 CudnnSupport::CudnnSupport(CUDAExecutor* parent) : parent_(parent) {}
 
 port::Status CudnnSupport::Init() {
+  ScopedActivateExecutorContext context(parent_);
   cudnnHandle_t cudnn_handle = nullptr;
   auto status = cudnnCreate(&cudnn_handle);
   if (status == CUDNN_STATUS_SUCCESS) {
diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index d508f6594a9f9ac3c924b0b952620b6a4ac727ea..dbece3adf938da95d550f32da14cd5f67ff802c2 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -102,117 +102,16 @@ class CreatedContexts {
 /* static */ int64 CreatedContexts::next_id_ = 1;  // 0 means "no context"
 
 // Formats CUresult to output prettified values into a log stream.
-// Error summaries taken from:
-// http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TYPES.html#group__CUDA__TYPES_1gc6c391505e117393cc2558fff6bfc2e9
-//
-// TODO(leary) switch to cuGetErrorName when updated cuda.h is available.
 string ToString(CUresult result) {
-#define OSTREAM_CUDA_ERROR(__name) \
-  case CUDA_ERROR_##__name:        \
-    return "CUDA_ERROR_" #__name;
-
-///////////////
-// NOTE: here we specify return code values outside of the enum explicitly
-// because our in-tree cuda.h is from the CUDA 5.5 SDK, but CUDA 6.0+ driver
-// libraries are deployed in the fleet these error codes are backwards
-// compatible, but if we see a "new" one, we want to be able to identify it in
-// the logs.
-//
-// Once we get a cuda.h that has cuGetErrorName (TODO is above) we can
-// eliminate this function and just rely on the driver to provide us these
-// strings.
-//
-// NOTE: "Must reboot all context" below is shorthand for, "must
-// destroy/recreate the offending context and any allocation which come from
-// it if you are to continue using CUDA."
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wswitch"
-  switch (result) {
-    OSTREAM_CUDA_ERROR(INVALID_VALUE)
-    OSTREAM_CUDA_ERROR(OUT_OF_MEMORY)
-    OSTREAM_CUDA_ERROR(NOT_INITIALIZED)
-    OSTREAM_CUDA_ERROR(DEINITIALIZED)
-    OSTREAM_CUDA_ERROR(NO_DEVICE)
-    OSTREAM_CUDA_ERROR(INVALID_DEVICE)
-    OSTREAM_CUDA_ERROR(INVALID_IMAGE)
-    OSTREAM_CUDA_ERROR(INVALID_CONTEXT)
-    OSTREAM_CUDA_ERROR(INVALID_HANDLE)
-    OSTREAM_CUDA_ERROR(NOT_FOUND)
-    OSTREAM_CUDA_ERROR(NOT_READY)
-    OSTREAM_CUDA_ERROR(NO_BINARY_FOR_GPU)
-
-    // Encountered an uncorrectable ECC error during execution.
-    OSTREAM_CUDA_ERROR(ECC_UNCORRECTABLE)
-
-    // Load/store on an invalid address. Must reboot all context.
-    case 700:
-      return "CUDA_ERROR_ILLEGAL_ADDRESS";
-    // Passed too many / wrong arguments, too many threads for register count.
-    case 701:
-      return "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES";
-    // Kernel took too long to execute.
-    case 702:
-      return "CUDA_ERROR_LAUNCH_TIMEOUT";
-    // Kernel launch uses an incompatible texturing mode.
-    case 703:
-      return "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING";
-    // Trying to re-enable peer access that already has it enabled.
-    case 704:
-      return "CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED";
-    // Trying to disable peer access that has not yet been enabled.
-    case 705:
-      return "CUDA_ERROR_PEER_ACCESS_NOT_ENABLED";
-    // Primary context for the specified device has already been initialized.
-    case 708:
-      return "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE";
-    // Context current to calling thread has been destroyed or is a primary
-    // context that has not yet been initialized.
-    case 709:
-      return "CUDA_ERROR_CONTEXT_IS_DESTROYED";
-    // Device-side assert triggered during kernel execution. Must reboot all
-    // context.
-    case 710:
-      return "CUDA_ERROR_ASSERT";
-    // Hardware resources to enable peer access have been exhausted.
-    case 711:
-      return "CUDA_ERROR_TOO_MANY_PEERS";
-    // Memory range has already been registered.
-    case 712:
-      return "CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED";
-    // Pointer does not correspond to any currently registered memory region.
-    case 713:
-      return "CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED";
-    // Due to stack corruption or exceeding stack size limit. Must reboot all
-    // context.
-    case 714:
-      return "CUDA_ERROR_HARDWARE_STACK_ERROR";
-    case 715:
-      return "CUDA_ERROR_ILLEGAL_INSTRUCTION";
-    // Load/store on an unaligned memory address. Must reboot all context.
-    case 716:
-      return "CUDA_ERROR_MISALIGNED_ADDRESS";
-    // Device instruction with specific address space given address not
-    // belonging to allowed address space. Must reboot all context.
-    case 717:
-      return "CUDA_ERROR_INVALID_ADDRESS_SPACE";
-    // Device program counter wrapped its address space. Must reboot all
-    // context.
-    case 718:
-      return "CUDA_ERROR_INVALID_PC";
-    // Exception on device while executing a kernel; e.g. deref invalid device
-    // pointer, accessing OOB shared memory. Must reboot all context.
-    case 719:
-      return "CUDA_ERROR_LAUNCH_FAILED";
-
-      OSTREAM_CUDA_ERROR(CONTEXT_ALREADY_IN_USE)
-      OSTREAM_CUDA_ERROR(PEER_ACCESS_UNSUPPORTED)
-      OSTREAM_CUDA_ERROR(NOT_PERMITTED)
-      OSTREAM_CUDA_ERROR(NOT_SUPPORTED)
-      OSTREAM_CUDA_ERROR(UNKNOWN)  // Unknown internal error to CUDA.
-    default:
-      return port::StrCat("CUresult(", static_cast<int>(result), ")");
+  const char *error_name;
+  if (cuGetErrorName(result, &error_name)) {
+    return port::StrCat("UNKNOWN ERROR (", static_cast<int>(result), ")");
+  }
+  const char *error_string;
+  if (cuGetErrorString(result, &error_string)) {
+    return error_name;
   }
-#pragma GCC diagnostic pop
+  return port::StrCat(error_name, ": ", error_string);
 }
 
 // Returns the current context and checks that it is in the set of CUDA contexts
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index 259c813c57976b542507d94e91ce264c50d42d18..73f05b94db8f022825ccc72c2222a78634423ddf 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -206,6 +206,48 @@ static string GetBinaryDir(bool strip_exe) {
   return exe_path;
 }
 
+bool CUDAExecutor::LoadModuleFromCuBin(const char *cubin, CUmodule *module) {
+  uint64_t module_refcount;
+  std::tie(*module, module_refcount) = gpu_binary_to_module_[cubin];
+
+  if (*module == nullptr) {
+    auto load_status = CUDADriver::LoadCubin(context_, cubin, module);
+    if (!load_status.ok()) {
+      LOG(ERROR) << "failed to load CUBIN: " << load_status;
+      return false;
+    }
+    module_refcount = 1;
+    VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
+            << " as module " << *module;
+  } else {
+    ++module_refcount;
+    VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
+            << " is already loaded as module " << *module;
+  }
+  gpu_binary_to_module_[cubin] = {*module, module_refcount};
+  return true;
+}
+
+bool CUDAExecutor::LoadModuleFromPtx(const char *ptx, CUmodule *module) {
+  uint64_t module_refcount;
+  std::tie(*module, module_refcount) = gpu_binary_to_module_[ptx];
+
+  if (*module == nullptr) {
+    if (!CUDADriver::LoadPtx(context_, ptx, module)) {
+      return false;
+    }
+    VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx) << " as module "
+            << *module;
+    module_refcount = 1;
+  } else {
+    ++module_refcount;
+    VLOG(3) << "PTX " << static_cast<const void *>(ptx)
+            << " is already loaded as module " << module;
+  }
+  gpu_binary_to_module_[ptx] = {*module, module_refcount};
+  return true;
+}
+
 bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
                              KernelBase *kernel) {
   CUDAKernel *cuda_kernel = AsCUDAKernel(kernel);
@@ -215,28 +257,13 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
   VLOG(3) << "GetKernel on kernel " << kernel << " : " << kernel->name();
 
   if (spec.has_cuda_cubin_in_memory()) {
+    mutex_lock lock{in_memory_modules_mu_};
     kernelname = &spec.cuda_cubin_in_memory().kernelname();
     const char *cubin = spec.cuda_cubin_in_memory().bytes();
-    mutex_lock lock{in_memory_modules_mu_};
-    uint64_t module_refcount;
-    std::tie(module, module_refcount) = gpu_binary_to_module_[cubin];
-
-    if (module == nullptr) {
-      auto load_status = CUDADriver::LoadCubin(context_, cubin, &module);
-      if (!load_status.ok()) {
-        LOG(ERROR) << "failed to load CUBIN: " << load_status;
-        return false;
-      }
-      module_refcount = 1;
-      VLOG(3) << "Loaded CUBIN " << static_cast<const void *>(cubin)
-              << " as module " << module;
-    } else {
-      ++module_refcount;
-      VLOG(3) << "CUBIN " << static_cast<const void *>(cubin)
-              << " is already loaded as module " << module;
+    if (!LoadModuleFromCuBin(cubin, &module)) {
+      return false;
     }
     kernel_to_gpu_binary_[kernel] = cubin;
-    gpu_binary_to_module_[cubin] = {module, module_refcount};
   } else if (spec.has_cuda_ptx_in_memory()) {
     kernelname = &spec.cuda_ptx_in_memory().kernelname();
 
@@ -254,24 +281,10 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
     }
 
     mutex_lock lock{in_memory_modules_mu_};
-    uint64_t module_refcount;
-    std::tie(module, module_refcount) = gpu_binary_to_module_[ptx];
-
-    if (module == nullptr) {
-      if (!CUDADriver::LoadPtx(context_, ptx, &module)) {
-        LOG(ERROR) << "failed to load PTX for kernel " << *kernelname;
-        return false;
-      }
-      VLOG(3) << "Loaded PTX " << static_cast<const void *>(ptx)
-              << " as module " << module;
-      module_refcount = 1;
-    } else {
-      ++module_refcount;
-      VLOG(3) << "PTX " << static_cast<const void *>(ptx)
-              << " is already loaded as module " << module;
+    if (!LoadModuleFromPtx(ptx, &module)) {
+      return false;
     }
     kernel_to_gpu_binary_[kernel] = ptx;
-    gpu_binary_to_module_[ptx] = {module, module_refcount};
   } else {
     LOG(WARNING) << "no method of loading CUDA kernel provided";
     return false;
@@ -295,6 +308,23 @@ bool CUDAExecutor::GetKernel(const MultiKernelLoaderSpec &spec,
   return true;
 }
 
+bool CUDAExecutor::UnloadGpuBinary(const void *gpu_binary) {
+  auto module_it = gpu_binary_to_module_.find(gpu_binary);
+  if (gpu_binary_to_module_.end() == module_it) {
+    VLOG(3) << "No loaded CUDA module for " << gpu_binary;
+    return false;
+  }
+  auto &module = module_it->second.first;
+  auto &refcount = module_it->second.second;
+  VLOG(3) << "Found CUDA module " << module << " with refcount " << refcount;
+  if (--refcount == 0) {
+    VLOG(3) << "Unloading CUDA module " << module;
+    CUDADriver::UnloadModule(context_, module);
+    gpu_binary_to_module_.erase(module_it);
+  }
+  return true;
+}
+
 void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
   VLOG(3) << "Unloading kernel " << kernel << " : " << kernel->name();
 
@@ -307,25 +337,52 @@ void CUDAExecutor::UnloadKernel(const KernelBase *kernel) {
   }
   VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
           << " has loaded GPU code " << gpu_binary_it->second;
-  auto module_it = gpu_binary_to_module_.find(gpu_binary_it->second);
-  if (gpu_binary_to_module_.end() == module_it) {
-    VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
-            << " has no loaded CUDA module.";
-    return;  // This kernel never loaded any modules
-  }
-  auto &module = module_it->second.first;
-  auto &refcount = module_it->second.second;
-  VLOG(3) << "Kernel " << kernel << " : " << kernel->name()
-          << " has loaded GPU code " << gpu_binary_it->second
-          << " into CUDA module " << module << " with refcount " << refcount;
-  if (--refcount == 0) {
-    VLOG(3) << "Unloading CUDA module " << module;
-    CUDADriver::UnloadModule(context_, module);
-    gpu_binary_to_module_.erase(module_it);
-  }
+  UnloadGpuBinary(gpu_binary_it->second);
   kernel_to_gpu_binary_.erase(gpu_binary_it);
 }
 
+bool CUDAExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
+                              ModuleHandle *module_handle) {
+  // In CUDAExecutor we store the pointer to the GPU binary (PTX or CUBIN) as
+  // ModuleHandle::id().
+  CUmodule cu_module;
+  if (spec.has_cuda_cubin_in_memory()) {
+    mutex_lock lock{in_memory_modules_mu_};
+    if (!LoadModuleFromCuBin(
+            reinterpret_cast<const char *>(spec.cuda_cubin_in_memory().data()),
+            &cu_module)) {
+      return false;
+    }
+    *module_handle = ModuleHandle(const_cast<void *>(
+        static_cast<const void *>(spec.cuda_cubin_in_memory().data())));
+    return true;
+  } else if (spec.has_cuda_ptx_in_memory()) {
+    if (cc_major_ == 0 && cc_minor_ == 0) {
+      return false;
+    }
+
+    if (!spec.cuda_ptx_in_memory()) {
+      return false;
+    }
+
+    mutex_lock lock{in_memory_modules_mu_};
+    if (!LoadModuleFromPtx(spec.cuda_ptx_in_memory(), &cu_module)) {
+      return false;
+    }
+    *module_handle = ModuleHandle(const_cast<void *>(
+        static_cast<const void *>(spec.cuda_ptx_in_memory())));
+    return true;
+  }
+  LOG(WARNING) << "no method of loading CUDA module provided";
+  return false;
+}
+
+bool CUDAExecutor::UnloadModule(ModuleHandle module_handle) {
+  const char *gpu_binary = reinterpret_cast<const char *>(module_handle.id());
+  mutex_lock lock{in_memory_modules_mu_};
+  return UnloadGpuBinary(gpu_binary);
+}
+
 bool CUDAExecutor::GetKernelMetadata(CUDAKernel *cuda_kernel,
                                      KernelMetadata *kernel_metadata) {
   int value;
@@ -783,16 +840,26 @@ bool CUDAExecutor::DeviceMemoryUsage(int64 *free, int64 *total) const {
   return CUDADriver::GetDeviceMemoryInfo(context_, free, total);
 }
 
-bool CUDAExecutor::GetSymbol(const string& symbol_name, void **mem,
+bool CUDAExecutor::GetSymbol(const string &symbol_name,
+                             ModuleHandle module_handle, void **mem,
                              size_t *bytes) {
+  auto lookup_in_module = [&](CUmodule module) {
+    CHECK(module != nullptr);
+    return CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
+                                       reinterpret_cast<CUdeviceptr *>(mem),
+                                       bytes);
+  };
+
   {  // give limited scope to mutex_lock
     mutex_lock lock{in_memory_modules_mu_};
+    if (static_cast<bool>(module_handle)) {
+      auto it = gpu_binary_to_module_.find(module_handle.id());
+      CHECK(it != gpu_binary_to_module_.end());
+      return lookup_in_module(it->second.first);
+    }
+
     for (auto &it : gpu_binary_to_module_) {
-      CUmodule module = it.second.first;
-      CHECK(module != nullptr);
-      if (CUDADriver::GetModuleSymbol(context_, module, symbol_name.c_str(),
-                                      reinterpret_cast<CUdeviceptr *>(mem),
-                                      bytes)) {
+      if (lookup_in_module(it.second.first)) {
         return true;
       }
     }
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index f7c341c857bd68446021d1fb274b4f4bed8eacbb..8a954d5461c60749019c87971cee22089bbd22e5 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -62,6 +62,9 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
   bool GetKernel(const MultiKernelLoaderSpec &spec,
                  KernelBase *kernel) override;
   void UnloadKernel(const KernelBase *kernel) override;
+  bool LoadModule(const MultiModuleLoaderSpec &spec,
+                  ModuleHandle *module_handle) override;
+  bool UnloadModule(ModuleHandle module_handle) override;
 
   bool Launch(Stream *stream, const ThreadDim &thread_dims,
               const BlockDim &block_dims, const KernelBase &k,
@@ -175,7 +178,8 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
 
   // Search for the symbol and returns a device pointer and size.
   // Returns false if symbol does not exist.
-  bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) override;
+  bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
+                 void **mem, size_t *bytes) override;
 
   DeviceDescription *PopulateDeviceDescription() const override;
 
@@ -239,6 +243,16 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
   void VlogOccupancyInfo(const KernelBase &kernel, const ThreadDim &thread_dims,
                          const BlockDim &block_dims);
 
+  bool LoadModuleFromCuBin(const char *cubin, CUmodule *module)
+      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+  // Loads the PTX text `ptx` as a CUDA module.  `ptx` must be null terminated.
+  bool LoadModuleFromPtx(const char *ptx, CUmodule *module)
+      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
+  bool UnloadGpuBinary(const void *gpu_binary)
+      EXCLUSIVE_LOCKS_REQUIRED(in_memory_modules_mu_);
+
   // Guards the in-memory-module mapping.
   mutex in_memory_modules_mu_;
 
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.cc b/tensorflow/stream_executor/host/host_gpu_executor.cc
index 3cd97b3cf165520e236ff6a1ce9280426fe5ed1f..8adf739b170c42e5aeda5ccf3ea469f2c3cea07c 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.cc
+++ b/tensorflow/stream_executor/host/host_gpu_executor.cc
@@ -93,7 +93,7 @@ bool HostExecutor::MemcpyDeviceToDevice(Stream *stream,
   // the nature of the HostExecutor) memcpy  on the stream (HostStream)
   // associated with the HostExecutor.
   AsHostStream(stream)->EnqueueTask(
-      [src_mem, dst_mem, size]() { memcpy(src_mem, dst_mem, size); });
+      [src_mem, dst_mem, size]() { memcpy(dst_mem, src_mem, size); });
   return true;
 }
 
diff --git a/tensorflow/stream_executor/module_spec.h b/tensorflow/stream_executor/module_spec.h
new file mode 100644
index 0000000000000000000000000000000000000000..75bdfed2d70364da4b191804d1e0973ee2658b70
--- /dev/null
+++ b/tensorflow/stream_executor/module_spec.h
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_STREAM_EXECUTOR_MODULE_SPEC_H_
+#define TENSORFLOW_STREAM_EXECUTOR_MODULE_SPEC_H_
+
+#include "tensorflow/stream_executor/lib/array_slice.h"
+#include "tensorflow/stream_executor/lib/stringpiece.h"
+#include "tensorflow/stream_executor/platform/logging.h"
+#include "tensorflow/stream_executor/platform/port.h"
+
+namespace stream_executor {
+
+// Describes how to load a module on a target platform.
+//
+// The exact meaning of a "module" may differ from platform to platform but
+// loosely speaking a module a collection of kernels and global variables.  It
+// corresponds to CUmodule when running on CUDA.
+class MultiModuleLoaderSpec {
+ public:
+  bool has_cuda_cubin_in_memory() const { return has_cuda_cubin_in_memory_; }
+  port::ArraySlice<const uint8> cuda_cubin_in_memory() const {
+    CHECK(has_cuda_cubin_in_memory());
+    return {cuda_cubin_in_memory_.data(), cuda_cubin_in_memory_.size()};
+  }
+
+  bool has_cuda_ptx_in_memory() const { return has_cuda_ptx_in_memory_; }
+  const char* cuda_ptx_in_memory() const {
+    CHECK(has_cuda_ptx_in_memory());
+    return cuda_ptx_in_memory_;
+  }
+
+  void AddCudaCubinInMemory(port::ArraySlice<const uint8> cubin_bytes) {
+    CHECK(!cubin_bytes.empty());
+    has_cuda_cubin_in_memory_ = true;
+    cuda_cubin_in_memory_ = cubin_bytes;
+  }
+
+  void AddCudaPtxInMemory(const char* ptx) {
+    has_cuda_ptx_in_memory_ = true;
+    // The CUDA driver does not like getting an empty string as PTX.
+    cuda_ptx_in_memory_ = *ptx ? ptx : nullptr;
+  }
+
+ private:
+  port::ArraySlice<const uint8> cuda_cubin_in_memory_;
+  bool has_cuda_cubin_in_memory_ = false;
+  const char* cuda_ptx_in_memory_;
+  bool has_cuda_ptx_in_memory_ = false;
+};
+
+}  // namespace stream_executor
+
+#endif  // TENSORFLOW_STREAM_EXECUTOR_MODULE_SPEC_H_
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index ca1b8e28e6da59fc6fcdfae02d3f652c362ace59..6248aa2d01b0e9cccd89caf73444485f89c69abf 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -268,6 +268,12 @@ Stream::~Stream() {
   VLOG_CALL();
 
   temporary_memory_manager_.ForceDeallocateAll();
+  // Ensure the stream is completed.
+  auto status = BlockHostUntilDone();
+  if (!status.ok()) {
+    LOG(WARNING) << "Error blocking host until done in stream destructor: "
+                 << status;
+  }
 
   if (allocated_) {
     parent_->DeallocateStream(this);
@@ -1935,7 +1941,14 @@ void Stream::ReturnSubStream(Stream *sub_stream) {
   mutex_lock lock(mu_);
   for (auto &stream : sub_streams_) {
     if (stream.first.get() == sub_stream) {
-      stream.second = true;
+      // Streams have a monotonic state machine; if a stream
+      // encounters an error, it will remain in an error state
+      // forever. Only allow re-use of ok streams.
+      //
+      // TODO(toddw): Improve this mechanism, if necessary, to drop
+      // failed streams completely.
+      const bool ready_to_reuse = sub_stream->ok();
+      stream.second = ready_to_reuse;
       return;
     }
   }
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index 63d64947c835bb187919901abc35c640f724e79b..706442a6662429edbe65ea94b933777694e9b2be 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -125,7 +125,7 @@ class Stream {
   Stream *GetOrCreateSubStream() LOCKS_EXCLUDED(mu_);
 
   // Return the sub-stream back to the host stream so that it can be reused
-  // later.
+  // later. Sub-streams that are !ok() will not be reused.
   void ReturnSubStream(Stream *sub_stream) LOCKS_EXCLUDED(mu_);
 
   // Allocate temporary memories. The stream will deallocate them when blocked
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index fb1b92cb84a1295e4bde0d24bc40b2629930b711..f34b1fc083adec40d57bf65cb49a4e7901ee1864 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -36,20 +36,38 @@ limitations under the License.
 #include "tensorflow/stream_executor/kernel_cache_config.h"
 #include "tensorflow/stream_executor/kernel_spec.h"
 #include "tensorflow/stream_executor/launch_dim.h"
+#include "tensorflow/stream_executor/lib/inlined_vector.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/module_spec.h"
 #include "tensorflow/stream_executor/platform.h"
 #include "tensorflow/stream_executor/platform/port.h"
 #include "tensorflow/stream_executor/plugin_registry.h"
 #include "tensorflow/stream_executor/shared_memory_config.h"
 #include "tensorflow/stream_executor/trace_listener.h"
-#include "tensorflow/stream_executor/lib/inlined_vector.h"
 
 namespace stream_executor {
 
 class Stream;
 class Timer;
 
+// An opaque handle to a loaded module.
+//
+// An instance of this is returned from StreamExecutor::GetModule.
+class ModuleHandle {
+ public:
+  /*implicit*/ ModuleHandle(void *id = nullptr) : id_(id) {}
+
+  // A ModuleHandle with id() == nullptr is an invalid module handle, akin to a
+  // null pointer.
+  void *id() const { return id_; }
+
+  explicit operator bool() const { return id() != nullptr; }
+
+ private:
+  void *id_;
+};
+
 namespace internal {
 
 // Platform-dependent interface class for the generic Events interface, in
@@ -164,6 +182,11 @@ class StreamExecutorInterface {
                          KernelBase *kernel) {
     return false;
   }
+  virtual bool LoadModule(const MultiModuleLoaderSpec &spec,
+                          ModuleHandle *module_handle) {
+    return false;
+  }
+  virtual bool UnloadModule(ModuleHandle module_handle) { return false; }
   virtual bool Launch(Stream *stream, const ThreadDim &thread_dims,
                       const BlockDim &block_dims, const KernelBase &k,
                       const KernelArgsArrayBase &args) {
@@ -247,7 +270,12 @@ class StreamExecutorInterface {
   // null, however, both of them cannot be null at the same time. To use
   // constant memory in CUDA, GetSymbol has to be used. Returns true if symbol
   // is found.
-  virtual bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes) {
+  //
+  // If ModuleHandle is set then we search for `symbol_name` only within the
+  // module corresponding to `module_handle`.  Otherwise all loaded modules are
+  // searched.
+  virtual bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
+                         void **mem, size_t *bytes) {
     return false;
   }
 
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc
index 000795ff0048dddb0eb4a08956e6de6f5e336f28..2e0137a485e77ef6bd62d07e334cbdc41132ce96 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.cc
+++ b/tensorflow/stream_executor/stream_executor_pimpl.cc
@@ -220,6 +220,15 @@ void StreamExecutor::UnloadKernel(const KernelBase *kernel) {
   implementation_->UnloadKernel(kernel);
 }
 
+bool StreamExecutor::LoadModule(const MultiModuleLoaderSpec &spec,
+                                ModuleHandle *module_handle) {
+  return implementation_->LoadModule(spec, module_handle);
+}
+
+bool StreamExecutor::UnloadModule(ModuleHandle module_handle) {
+  return implementation_->UnloadModule(module_handle);
+}
+
 void StreamExecutor::Deallocate(DeviceMemoryBase *mem) {
   VLOG(1) << "Called StreamExecutor::Deallocate(mem=" << mem->opaque()
           << ") mem->size()=" << mem->size() << StackTraceIfVLOG10();
@@ -459,9 +468,34 @@ void *StreamExecutor::Allocate(uint64 size) {
   return buf;
 }
 
-bool StreamExecutor::GetSymbol(const string &symbol_name, void **mem,
+port::StatusOr<DeviceMemoryBase> StreamExecutor::GetUntypedSymbol(
+    const string &symbol_name, ModuleHandle module_handle) {
+  // If failed to get the symbol, opaque/bytes are unchanged. Initialize them to
+  // be nullptr/0 for consistency with DeviceMemory semantics.
+  void *opaque = nullptr;
+  size_t bytes = 0;
+  if (GetSymbol(symbol_name, module_handle, &opaque, &bytes)) {
+    return DeviceMemoryBase(opaque, bytes);
+  }
+
+  if (static_cast<bool>(module_handle)) {
+    return port::Status(
+        port::error::NOT_FOUND,
+        port::StrCat("Check if module containing symbol ", symbol_name,
+                     " is loaded (module_handle = ",
+                     reinterpret_cast<uintptr_t>(module_handle.id()), ")"));
+  } else {
+    return port::Status(
+        port::error::NOT_FOUND,
+        port::StrCat("Check if kernel using the symbol is loaded: ",
+                     symbol_name));
+  }
+}
+
+bool StreamExecutor::GetSymbol(const string &symbol_name,
+                               ModuleHandle module_handle, void **mem,
                                size_t *bytes) {
-  return implementation_->GetSymbol(symbol_name, mem, bytes);
+  return implementation_->GetSymbol(symbol_name, module_handle, mem, bytes);
 }
 
 void *StreamExecutor::UnifiedMemoryAllocate(uint64 bytes) {
diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h
index ad80a1ba259ce0c6e2785373cc986b8bf34f6460..47b3a2b030ca68a079a1f9de238a2ed58f18b7e8 100644
--- a/tensorflow/stream_executor/stream_executor_pimpl.h
+++ b/tensorflow/stream_executor/stream_executor_pimpl.h
@@ -106,6 +106,16 @@ class StreamExecutor {
   // Releases any state associated with the previously loaded kernel.
   void UnloadKernel(const KernelBase *kernel);
 
+  // Loads a module for the platform this StreamExecutor is acting upon.
+  //
+  // `spec` describes the module to be loaded.  On success writes the handle for
+  // the loaded module to `module_handle` and returns true.  Else returns false.
+  bool LoadModule(const MultiModuleLoaderSpec &spec,
+                  ModuleHandle *module_handle);
+
+  // Unloads the module with handle `module_handle`.
+  bool UnloadModule(ModuleHandle module_handle);
+
   // Synchronously allocates an array on the device of type T with element_count
   // elements.
   template <typename T>
@@ -169,8 +179,16 @@ class StreamExecutor {
   // type of symbol and T match.
   // - Note: symbol_name should include its namespace as well. For example,
   //         pass "nms0::symbol" if referring to nms0::symbol.
+  //
+  // If `module_handle` is set then searches only within the module
+  // corresponding to `module_handle`.
   template <typename T>
-  port::StatusOr<DeviceMemory<T>> GetSymbol(const string &symbol_name);
+  port::StatusOr<DeviceMemory<T>> GetSymbol(const string &symbol_name,
+                                            ModuleHandle module_handle = {});
+
+  // An untyped version of GetSymbol.
+  port::StatusOr<DeviceMemoryBase> GetUntypedSymbol(
+      const string &symbol_name, ModuleHandle module_handle = {});
 
   // Deallocate the DeviceMemory previously allocated via this interface.
   // Deallocation of a nullptr-representative value is permitted.
@@ -507,7 +525,8 @@ class StreamExecutor {
 
   // Finds and retrieves device memory for the symbol on the underlying
   // platform.
-  bool GetSymbol(const string& symbol_name, void **mem, size_t *bytes);
+  bool GetSymbol(const string &symbol_name, ModuleHandle module_handle,
+                 void **mem, size_t *bytes);
 
   // Entrains a memcpy operation onto stream, with a host destination location
   // host_dst and a device memory source, with target size size.
@@ -678,6 +697,41 @@ class StreamExecutor {
   SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutor);
 };
 
+// A wrapper around ModuleHandle that uses RAII to manage its lifetime.
+class ScopedModuleHandle {
+ public:
+  explicit ScopedModuleHandle(StreamExecutor *executor,
+                              ModuleHandle module_handle)
+      : executor_(executor), module_handle_(module_handle) {}
+
+  ScopedModuleHandle(ScopedModuleHandle &&other) {
+    executor_ = other.executor_;
+    module_handle_ = other.module_handle_;
+    other.executor_ = nullptr;
+    other.module_handle_ = ModuleHandle();
+  }
+
+  ScopedModuleHandle &operator=(ScopedModuleHandle &&other) {
+    executor_ = other.executor_;
+    module_handle_ = other.module_handle_;
+    other.executor_ = nullptr;
+    other.module_handle_ = ModuleHandle();
+    return *this;
+  }
+
+  ~ScopedModuleHandle() {
+    if (static_cast<bool>(module_handle_)) {
+      CHECK(executor_->UnloadModule(module_handle_));
+    }
+  }
+
+ private:
+  StreamExecutor *executor_;
+  ModuleHandle module_handle_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(ScopedModuleHandle);
+};
+
 ////////////
 // Inlines
 
@@ -690,19 +744,13 @@ inline DeviceMemory<T> StreamExecutor::AllocateArray(uint64 element_count) {
 
 template <typename T>
 inline port::StatusOr<DeviceMemory<T>> StreamExecutor::GetSymbol(
-    const string &symbol_name) {
-  // If failed to get the symbol, opaque/bytes are unchanged. Initialize them to
-  // be nullptr/0 for consistency with DeviceMemory semantics.
-  void *opaque = nullptr;
-  size_t bytes = 0;
-  if (GetSymbol(symbol_name, &opaque, &bytes)) {
-    CHECK_EQ(bytes % sizeof(T), 0);
-    return DeviceMemory<T>::MakeFromByteSize(opaque, bytes);
+    const string &symbol_name, ModuleHandle module_handle) {
+  port::StatusOr<DeviceMemoryBase> untyped_symbol =
+      GetUntypedSymbol(symbol_name, module_handle);
+  if (!untyped_symbol.ok()) {
+    return untyped_symbol.status();
   }
-  return port::Status(
-      port::error::NOT_FOUND,
-      port::StrCat("Check if kernel using the symbol is loaded: ",
-                   symbol_name));
+  return DeviceMemory<T>(untyped_symbol.ValueOrDie());
 }
 
 template <typename ElemT>
diff --git a/tensorflow/stream_executor/stream_test.cc b/tensorflow/stream_executor/stream_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..47dd67583497463897d0a740d81c9012a0ff9452
--- /dev/null
+++ b/tensorflow/stream_executor/stream_test.cc
@@ -0,0 +1,139 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/stream_executor/stream_executor.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace stream_executor {
+namespace {
+
+class StreamTest : public ::testing::Test {
+ protected:
+  std::unique_ptr<StreamExecutor> NewStreamExecutor() {
+    Platform* platform =
+        MultiPlatformManager::PlatformWithName("Host").ConsumeValueOrDie();
+    StreamExecutorConfig config(/*ordinal=*/0);
+    return platform->GetUncachedExecutor(config).ConsumeValueOrDie();
+  }
+};
+
+TEST_F(StreamTest, NoInitNotOk) {
+  std::unique_ptr<StreamExecutor> executor = NewStreamExecutor();
+  Stream stream(executor.get());
+  EXPECT_FALSE(stream.ok());
+}
+
+TEST_F(StreamTest, InitOk) {
+  std::unique_ptr<StreamExecutor> executor = NewStreamExecutor();
+  Stream stream(executor.get());
+  stream.Init();
+  EXPECT_TRUE(stream.ok());
+}
+
+TEST_F(StreamTest, OneSubStream) {
+  std::unique_ptr<StreamExecutor> executor = NewStreamExecutor();
+  Stream stream(executor.get());
+  stream.Init();
+  EXPECT_TRUE(stream.ok());
+
+  // Get and return a sub-stream. Sub-streams are always initialized.
+  Stream* sub_stream1 = stream.GetOrCreateSubStream();
+  EXPECT_TRUE(sub_stream1->ok());
+  stream.ReturnSubStream(sub_stream1);
+
+  // Get and return another sub-stream.
+  Stream* sub_stream2 = stream.GetOrCreateSubStream();
+  EXPECT_TRUE(sub_stream2->ok());
+  stream.ReturnSubStream(sub_stream1);
+
+  // The underlying sub-streams should be the same, since sub_stream1
+  // was returned before we tried to get sub_stream2.
+  EXPECT_EQ(sub_stream1, sub_stream2);
+}
+
+TEST_F(StreamTest, TwoSubStreams) {
+  std::unique_ptr<StreamExecutor> executor = NewStreamExecutor();
+  Stream stream(executor.get());
+  stream.Init();
+  EXPECT_TRUE(stream.ok());
+
+  // Get two sub-streams.
+  Stream* sub_stream1 = stream.GetOrCreateSubStream();
+  EXPECT_TRUE(sub_stream1->ok());
+  Stream* sub_stream2 = stream.GetOrCreateSubStream();
+  EXPECT_TRUE(sub_stream2->ok());
+
+  // The underlying sub-streams should be different, since neither
+  // sub-stream has been returned.
+  EXPECT_NE(sub_stream1, sub_stream2);
+
+  // Return sub_stream1 and get sub_stream3, which should be the same.
+  stream.ReturnSubStream(sub_stream1);
+  Stream* sub_stream3 = stream.GetOrCreateSubStream();
+  EXPECT_TRUE(sub_stream3->ok());
+  EXPECT_EQ(sub_stream1, sub_stream3);
+  EXPECT_NE(sub_stream2, sub_stream3);
+
+  // Return sub_stream2 and get sub_stream4, which should be the same.
+  stream.ReturnSubStream(sub_stream2);
+  Stream* sub_stream4 = stream.GetOrCreateSubStream();
+  EXPECT_TRUE(sub_stream4->ok());
+  EXPECT_EQ(sub_stream2, sub_stream4);
+  EXPECT_NE(sub_stream3, sub_stream4);
+}
+
+TEST_F(StreamTest, FailedSubStreamNotReused) {
+  std::unique_ptr<StreamExecutor> executor = NewStreamExecutor();
+  Stream stream(executor.get());
+  stream.Init();
+  EXPECT_TRUE(stream.ok());
+
+  // Get a sub-stream.
+  Stream* sub_stream1 = stream.GetOrCreateSubStream();
+  EXPECT_TRUE(sub_stream1->ok());
+
+  // Force an error on the stream; here we call a method that requires
+  // DNN support, which we know the Host platform doesn't support.
+  sub_stream1->ThenDepthConcatenate({}, {}, nullptr);
+  EXPECT_FALSE(sub_stream1->ok());
+
+  // Return sub_stream1 and get sub_stream2.
+  stream.ReturnSubStream(sub_stream1);
+  Stream* sub_stream2 = stream.GetOrCreateSubStream();
+  EXPECT_TRUE(sub_stream2->ok());
+
+  // The underlying streams should be different. They would have been
+  // the same, but since we forced an error on sub_stream1, it will
+  // not be re-used. Sadly we can't just check:
+  //   EXPECT_NE(sub_stream1, sub_stream2);
+  //
+  // The above should hold logically, but it may fail if the new
+  // stream instance allocated for sub_stream2 happens to reside in
+  // the same memory address as sub_stream1.
+  //
+  // The check that sub_stream2->ok() serves as a good-enough check.
+
+  // Return sub_stream2 and get sub_stream3. The previous error on
+  // sub_stream1 has no effect on these streams, and they are the
+  // same.
+  stream.ReturnSubStream(sub_stream2);
+  Stream* sub_stream3 = stream.GetOrCreateSubStream();
+  EXPECT_TRUE(sub_stream3->ok());
+  EXPECT_EQ(sub_stream2, sub_stream3);
+}
+
+}  // namespace
+}  // namespace stream_executor
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 955b53f691c418f4ebcf120caf9f00e90a4f645e..58282ec1c71cb04accea1bafa728808c2f60d315 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -9,6 +9,7 @@ load(
     "tf_additional_grpc_deps_py",
     "tf_additional_xla_deps_py",
     "if_static",
+    "if_dynamic_kernels",
 )
 load(
     "@local_config_tensorrt//:build_defs.bzl",
@@ -137,6 +138,14 @@ def if_not_mobile(a):
       "//conditions:default": a,
   })
 
+# Config setting selector used when building for products
+# which requires restricted licenses to be avoided.
+def if_not_lgpl_restricted(a):
+  _ = (a,)
+  return select({
+      "//conditions:default": [],
+  })
+
 def if_not_windows(a):
   return select({
       clean_dep("//tensorflow:windows"): [],
@@ -236,6 +245,7 @@ def tf_copts(android_optimization_level_override="-O2", is_external=False):
             clean_dep("//tensorflow:windows"): get_win_copts(is_external),
             clean_dep("//tensorflow:windows_msvc"): get_win_copts(is_external),
             clean_dep("//tensorflow:ios"): ["-std=c++11"],
+            clean_dep("//tensorflow:no_lgpl_deps"): ["-D__TENSORFLOW_NO_LGPL_DEPS__", "-pthread"],
             "//conditions:default": ["-pthread"]
       }))
 
@@ -309,18 +319,36 @@ def tf_binary_additional_srcs():
           clean_dep("//tensorflow:libtensorflow_framework.so"),
       ])
 
+
+# Helper functions to add kernel dependencies to tf binaries when using dynamic
+# kernel linking.
+def tf_binary_dynamic_kernel_dsos(kernels):
+  return if_dynamic_kernels(
+      extra_deps=["libtfkernel_%s.so" % clean_dep(k) for k in kernels],
+      otherwise=[])
+
+# Helper functions to add kernel dependencies to tf binaries when using static
+# kernel linking.
+def tf_binary_dynamic_kernel_deps(kernels):
+  return if_dynamic_kernels(
+      extra_deps=[],
+      otherwise=kernels)
+
 def tf_cc_shared_object(
     name,
     srcs=[],
     deps=[],
+    data=[],
     linkopts=[],
     framework_so=tf_binary_additional_srcs(),
+    kernels=[],
     **kwargs):
   native.cc_binary(
       name=name,
       srcs=srcs + framework_so,
-      deps=deps,
+      deps=deps + tf_binary_dynamic_kernel_deps(kernels),
       linkshared = 1,
+      data = data + tf_binary_dynamic_kernel_dsos(kernels),
       linkopts=linkopts + _rpath_linkopts(name) + select({
           clean_dep("//tensorflow:darwin"): [
               "-Wl,-install_name,@rpath/" + name.split("/")[-1],
@@ -344,18 +372,21 @@ register_extension_info(
 def tf_cc_binary(name,
                  srcs=[],
                  deps=[],
+                 data=[],
                  linkopts=[],
                  copts=tf_copts(),
+                 kernels=[],
                  **kwargs):
   native.cc_binary(
       name=name,
       copts=copts,
       srcs=srcs + tf_binary_additional_srcs(),
-      deps=deps + if_mkl(
+      deps=deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl(
           [
               "//third_party/mkl:intel_binary_blob",
           ],
       ),
+      data=data +  tf_binary_dynamic_kernel_dsos(kernels),
       linkopts=linkopts + _rpath_linkopts(name),
       **kwargs)
 
@@ -540,9 +571,6 @@ def tf_gen_op_wrappers_cc(name,
 #     is invalid to specify both "hidden" and "op_whitelist".
 #   cc_linkopts: Optional linkopts to be added to tf_cc_binary that contains the
 #     specified ops.
-#   gen_locally: if True, the genrule to generate the Python library will be run
-#     without sandboxing. This would help when the genrule depends on symlinks
-#     which may not be supported in the sandbox.
 def tf_gen_op_wrapper_py(name,
                          out=None,
                          hidden=None,
@@ -553,8 +581,7 @@ def tf_gen_op_wrapper_py(name,
                          generated_target_name=None,
                          op_whitelist=[],
                          cc_linkopts=[],
-                         api_def_srcs=[],
-                         gen_locally=False):
+                         api_def_srcs=[]):
   if (hidden or hidden_file) and op_whitelist:
     fail('Cannot pass specify both hidden and op_whitelist.')
 
@@ -609,7 +636,6 @@ def tf_gen_op_wrapper_py(name,
         outs=[out],
         srcs=api_def_srcs + [hidden_file],
         tools=[tool_name] + tf_binary_additional_srcs(),
-        local = (1 if gen_locally else 0),
         cmd=("$(location " + tool_name + ") " + api_def_args_str +
              " @$(location " + hidden_file + ") " +
              ("1" if require_shape_functions else "0") + " > $@"))
@@ -619,7 +645,6 @@ def tf_gen_op_wrapper_py(name,
         outs=[out],
         srcs=api_def_srcs,
         tools=[tool_name] + tf_binary_additional_srcs(),
-        local = (1 if gen_locally else 0),
         cmd=("$(location " + tool_name + ") " + api_def_args_str + " " +
              op_list_arg + " " +
              ("1" if require_shape_functions else "0") + " " +
@@ -649,11 +674,13 @@ def tf_gen_op_wrapper_py(name,
 def tf_cc_test(name,
                srcs,
                deps,
+               data=[],
                linkstatic=0,
                extra_copts=[],
                suffix="",
                linkopts=[],
                nocopts=None,
+               kernels=[],
                **kwargs):
   native.cc_test(
       name="%s%s" % (name, suffix),
@@ -673,11 +700,12 @@ def tf_cc_test(name,
             "-lm"
         ],
       }) + linkopts + _rpath_linkopts(name),
-      deps=deps + if_mkl(
+      deps=deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl(
           [
               "//third_party/mkl:intel_binary_blob",
           ],
       ),
+      data=data + tf_binary_dynamic_kernel_dsos(kernels),
       # Nested select() statements seem not to be supported when passed to
       # linkstatic, and we already have a cuda select() passed in to this
       # function.
@@ -778,6 +806,7 @@ def tf_cuda_only_cc_test(name,
                     size="medium",
                     linkstatic=0,
                     args=[],
+                    kernels=[],
                     linkopts=[]):
   native.cc_test(
       name="%s%s" % (name, "_gpu"),
@@ -785,8 +814,8 @@ def tf_cuda_only_cc_test(name,
       size=size,
       args=args,
       copts= _cuda_copts() + tf_copts(),
-      data=data,
-      deps=deps + if_cuda([
+      data=data + tf_binary_dynamic_kernel_dsos(kernels),
+      deps=deps + tf_binary_dynamic_kernel_deps(kernels) + if_cuda([
           clean_dep("//tensorflow/core:cuda"),
           clean_dep("//tensorflow/core:gpu_lib")]),
       linkopts=if_not_windows(["-lpthread", "-lm"]) + linkopts + _rpath_linkopts(name),
@@ -829,9 +858,11 @@ def tf_cc_tests(srcs,
 def tf_cc_test_mkl(srcs,
                    deps,
                    name="",
+                   data=[],
                    linkstatic=0,
                    tags=[],
                    size="medium",
+                   kernels=[],
                    args=None):
   # -fno-exceptions in nocopts breaks compilation if header modules are enabled.
   disable_header_modules = ["-use_header_modules"]
@@ -852,11 +883,12 @@ def tf_cc_test_mkl(srcs,
             "-lm"
         ],
       }) + _rpath_linkopts(src_to_test_name(src)),
-      deps=deps + if_mkl(
+      deps=deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl(
           [
               "//third_party/mkl:intel_binary_blob",
           ],
       ),
+      data=data + tf_binary_dynamic_kernel_dsos(kernels),
       linkstatic=linkstatic,
       tags=tags,
       size=size,
@@ -896,12 +928,13 @@ def tf_cuda_cc_tests(srcs,
 def tf_java_test(name,
                  srcs=[],
                  deps=[],
+                 kernels=[],
                  *args,
                  **kwargs):
   native.java_test(
       name=name,
       srcs=srcs,
-      deps=deps + tf_binary_additional_srcs(),
+      deps=deps + tf_binary_additional_srcs() + tf_binary_dynamic_kernel_dsos(kernels) + tf_binary_dynamic_kernel_deps(kernels),
       *args,
       **kwargs)
 
@@ -1075,6 +1108,15 @@ def tf_kernel_library(
       deps=deps,
       **kwargs)
 
+  # TODO(gunan): CUDA dependency not clear here. Fix it.
+  tf_cc_shared_object(
+      name="libtfkernel_%s.so" % name,
+      srcs=srcs + hdrs,
+      copts=copts,
+      deps=deps,
+      tags=["manual", "notap"])
+
+
 register_extension_info(
     extension_name = "tf_kernel_library",
     label_regex_for_dep = "{extension_name}(_gpu)?",
@@ -1351,7 +1393,7 @@ def tf_custom_op_library(name, srcs=[], gpu_srcs=[], deps=[], linkopts=[]):
       name=name,
       srcs=srcs,
       deps=deps + if_cuda(cuda_deps),
-      data=[name + "_check_deps"],
+      data=if_static([name + "_check_deps"]),
       copts=tf_copts(is_external=True),
       features = ["windows_export_all_symbols"],
       linkopts=linkopts + select({
@@ -1447,7 +1489,7 @@ def tf_py_wrap_cc(name,
       srcs=srcs,
       swig_includes=swig_includes,
       deps=deps + extra_deps,
-      toolchain_deps=["//tools/defaults:crosstool"],
+      toolchain_deps=["@bazel_tools//tools/cpp:current_cc_toolchain"],
       module_name=module_name,
       py_module_name=name)
   vscriptname=name+"_versionscript"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
index 9dbb5d16a4e903a755c86bd0a6241180d1999f4d..c23b04b4ef85a290f055d35d0c7f0d4d8a18a2de 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
index 34a30c2874b90285706c9df6bec8cbbdc3451fe4..6878d28fffabc895433f97415ee71cfe8f6232c1 100644
--- a/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'n_batches_per_layer\', \'model_dir\', \'label_dimension\', \'weight_column\', \'n_trees\', \'max_depth\', \'learning_rate\', \'l1_regularization\', \'l2_regularization\', \'tree_complexity\', \'min_node_weight\', \'config\', \'center_bias\', \'pruning_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'<object object instance>\', \'None\', \'100\', \'6\', \'0.1\', \'0.0\', \'0.0\', \'0.0\', \'0.0\', \'None\', \'False\', \'none\'], "
   }
   member_method {
     name: "eval_dir"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
index 8295905975f9be76e3608c111b0e7cbe1f152a2b..65cfad77d1f3cdf682b6681fbebc950e6c1ca8a8 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.-sequential.pbtxt
@@ -266,6 +266,10 @@ tf_class {
     name: "summary"
     argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "symbolic_set_inputs"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "test_on_batch"
     argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.activations.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.activations.pbtxt
index 2cd83baf65cf4114e58f52cdc40de7e4b6df7554..2e9de9ebb21021ab82ed4409243e13db49d7327c 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.activations.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.activations.pbtxt
@@ -22,7 +22,7 @@ tf_module {
   }
   member_method {
     name: "relu"
-    argspec: "args=[\'x\', \'alpha\', \'max_value\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
+    argspec: "args=[\'x\', \'alpha\', \'max_value\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\', \'0\'], "
   }
   member_method {
     name: "selu"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
index fddac63b7817102cfc7e46d132d2871d8726c358..126ce8db6a73e2c486dbf34512812e630b3e9a32 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.backend.pbtxt
@@ -366,7 +366,7 @@ tf_module {
   }
   member_method {
     name: "relu"
-    argspec: "args=[\'x\', \'alpha\', \'max_value\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
+    argspec: "args=[\'x\', \'alpha\', \'max_value\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\', \'0\'], "
   }
   member_method {
     name: "repeat"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.layers.-re-l-u.pbtxt
index c00fa79adfbe5b986b481f6c9567bafbf3abc1ae..4d3de58bd188e301ef516ac5eeae6cf0709d66da 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -82,7 +82,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'max_value\'], varargs=None, keywords=kwargs, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'max_value\', \'negative_slope\', \'threshold\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'0\', \'0\'], "
   }
   member_method {
     name: "add_loss"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt
index a97a9b57587070ec4841b627920ac91737a67997..73b577da373b1381a7e8d5841d6e002452a21f9e 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt
@@ -22,7 +22,7 @@ tf_module {
   }
   member_method {
     name: "binary_accuracy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0.5\'], "
   }
   member_method {
     name: "binary_crossentropy"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
index 5211657414b24ad1971055446a5d021960941275..6a83129f7df46a63c8fa1080a6a35dc3f558c549 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.models.-sequential.pbtxt
@@ -266,6 +266,10 @@ tf_class {
     name: "summary"
     argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "symbolic_set_inputs"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "test_on_batch"
     argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49ff85728ffab559ec706691356ce071aab89083
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.__metaclass__.pbtxt
@@ -0,0 +1,14 @@
+path: "tensorflow.linalg.LinearOperatorZeros.__metaclass__"
+tf_class {
+  is_instance: "<class \'abc.ABCMeta\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "mro"
+  }
+  member_method {
+    name: "register"
+    argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a1b0e06b4753488bc9fcbe9aeb0d260092745f9c
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -0,0 +1,130 @@
+path: "tensorflow.linalg.LinearOperatorZeros"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator_zeros.LinearOperatorZeros\'>"
+  is_instance: "<class \'tensorflow.python.ops.linalg.linear_operator.LinearOperator\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "batch_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "domain_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "graph_parents"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_non_singular"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_positive_definite"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_self_adjoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "is_square"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "range_dimension"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "tensor_rank"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_rows\', \'num_columns\', \'batch_shape\', \'dtype\', \'is_non_singular\', \'is_self_adjoint\', \'is_positive_definite\', \'is_square\', \'assert_proper_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'True\', \'False\', \'True\', \'False\', \'LinearOperatorZeros\'], "
+  }
+  member_method {
+    name: "add_to_tensor"
+    argspec: "args=[\'self\', \'mat\', \'name\'], varargs=None, keywords=None, defaults=[\'add_to_tensor\'], "
+  }
+  member_method {
+    name: "assert_non_singular"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_non_singular\'], "
+  }
+  member_method {
+    name: "assert_positive_definite"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_positive_definite\'], "
+  }
+  member_method {
+    name: "assert_self_adjoint"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'assert_self_adjoint\'], "
+  }
+  member_method {
+    name: "batch_shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
+  }
+  member_method {
+    name: "determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
+  }
+  member_method {
+    name: "diag_part"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'diag_part\'], "
+  }
+  member_method {
+    name: "domain_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'domain_dimension_tensor\'], "
+  }
+  member_method {
+    name: "log_abs_determinant"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'log_abs_det\'], "
+  }
+  member_method {
+    name: "matmul"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'matmul\'], "
+  }
+  member_method {
+    name: "matvec"
+    argspec: "args=[\'self\', \'x\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'matvec\'], "
+  }
+  member_method {
+    name: "range_dimension_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'range_dimension_tensor\'], "
+  }
+  member_method {
+    name: "shape_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'shape_tensor\'], "
+  }
+  member_method {
+    name: "solve"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'adjoint_arg\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'solve\'], "
+  }
+  member_method {
+    name: "solvevec"
+    argspec: "args=[\'self\', \'rhs\', \'adjoint\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'solve\'], "
+  }
+  member_method {
+    name: "tensor_rank_tensor"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'tensor_rank_tensor\'], "
+  }
+  member_method {
+    name: "to_dense"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'to_dense\'], "
+  }
+  member_method {
+    name: "trace"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'trace\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
index 3b5845f99a474ed976b91dab4f80ac2f231e7fc1..d979116887a739d2d372687fac0e5ea3b39a4b69 100644
--- a/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.linalg.pbtxt
@@ -52,6 +52,10 @@ tf_module {
     name: "LinearOperatorScaledIdentity"
     mtype: "<class \'abc.ABCMeta\'>"
   }
+  member {
+    name: "LinearOperatorZeros"
+    mtype: "<class \'abc.ABCMeta\'>"
+  }
   member_method {
     name: "adjoint"
     argspec: "args=[\'matrix\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu b/tensorflow/tools/ci_build/Dockerfile.gpu
index 7591ecc04efa887ec1d35ba92881386f5a25241d..383f9545c9fc47c6f2c0213a2c07af48085461a3 100644
--- a/tensorflow/tools/ci_build/Dockerfile.gpu
+++ b/tensorflow/tools/ci_build/Dockerfile.gpu
@@ -14,6 +14,7 @@ RUN /install/install_bootstrap_deb_packages.sh
 RUN add-apt-repository -y ppa:openjdk-r/ppa && \
     add-apt-repository -y ppa:george-edison55/cmake-3.x
 RUN /install/install_deb_packages.sh
+
 RUN /install/install_pip_packages.sh
 RUN /install/install_bazel.sh
 RUN /install/install_golang.sh
@@ -22,6 +23,11 @@ RUN /install/install_golang.sh
 COPY install/.bazelrc /etc/bazel.bazelrc
 ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
 
+# Link NCCL libray and header where the build script expects them.
+RUN mkdir /usr/local/cuda-9.0/lib &&  \
+    ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
+    ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
+
 # Configure the build for our CUDA configuration.
 ENV TF_NEED_CUDA 1
 ENV TF_CUDA_COMPUTE_CAPABILITIES 3.0
diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh
index 08e2c3edd2d22fbb7b9912c9ce7ec561dc5a7113..5115be8c6d0c9cf1f5319256c20bc1f7ab01bad5 100755
--- a/tensorflow/tools/ci_build/ci_parameterized_build.sh
+++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh
@@ -150,36 +150,7 @@ BAZEL_TARGET="//tensorflow/... -//tensorflow/compiler/..."
 if [[ -n "$TF_SKIP_CONTRIB_TESTS" ]]; then
   BAZEL_TARGET="$BAZEL_TARGET -//tensorflow/contrib/..."
 else
-  BAZEL_TARGET="${BAZEL_TARGET} -//tensorflow/contrib/lite/..."
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite:context_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite:framework"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite:interpreter_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite:model_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/toco:toco"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite:simple_memory_arena_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite:string_util_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:activations_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:add_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:basic_rnn_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:concatenation_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:conv_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:depthwise_conv_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:embedding_lookup_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:embedding_lookup_sparse_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:fully_connected_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:hashtable_lookup_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:local_response_norm_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:lsh_projection_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:lstm_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:l2norm_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:mul_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:pooling_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:reshape_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:resize_bilinear_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:skip_gram_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:softmax_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:space_to_depth_test"
-  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/kernels:svdf_test"
+  BAZEL_TARGET="${BAZEL_TARGET} //tensorflow/contrib/lite/..."
 fi
 
 TUT_TEST_DATA_DIR="/tmp/tf_tutorial_test_data"
diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh
index db37edf8097844646236aace5e3517a8080d70cb..866fe95d2b4b358b63b14b8744eb631a58e18b49 100755
--- a/tensorflow/tools/ci_build/ci_sanity.sh
+++ b/tensorflow/tools/ci_build/ci_sanity.sh
@@ -354,7 +354,7 @@ do_external_licenses_check(){
 
   # Whitelist
   echo ${EXTRA_LICENSE_FILE}
-  grep -e "@bazel_tools//src" -e "@bazel_tools//tools/" -e "@com_google_absl//" -e "//external" -e "@local" -e "@com_github_googlecloudplatform_google_cloud_cpp//" -v ${EXTRA_LICENSES_FILE} > temp.txt
+  grep -e "@bazel_tools//src" -e "@bazel_tools//tools/" -e "@com_google_absl//" -e "//external" -e "@local" -e "@com_github_googlecloudplatform_google_cloud_cpp//" -e "@embedded_jdk//" -v ${EXTRA_LICENSES_FILE} > temp.txt
   mv temp.txt ${EXTRA_LICENSES_FILE}
 
 
diff --git a/tensorflow/tools/ci_build/install/install_bazel.sh b/tensorflow/tools/ci_build/install/install_bazel.sh
index adbff8f6ef1432f541fac53f710cec23130e0987..e284401b8aa469ebcbed856cd09dd597be242d7a 100755
--- a/tensorflow/tools/ci_build/install/install_bazel.sh
+++ b/tensorflow/tools/ci_build/install/install_bazel.sh
@@ -15,7 +15,7 @@
 # ==============================================================================
 
 # Select bazel version.
-BAZEL_VERSION="0.14.1"
+BAZEL_VERSION="0.15.0"
 
 set +e
 local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')
diff --git a/tensorflow/tools/ci_build/install/install_bazel_from_source.sh b/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
index 9d24b3e4213e708a5b635a27028f78c6c4b178e7..87be81577d0efb395a12afc85109f10ad4178c27 100755
--- a/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
+++ b/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
@@ -18,7 +18,7 @@
 # It will compile bazel from source and install it in /usr/local/bin
 
 # Select bazel version.
-BAZEL_VERSION="0.14.1"
+BAZEL_VERSION="0.15.0"
 
 set +e
 local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')
diff --git a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
index 2b68de3c5b9bbb0c09ddead7466049827fac4147..f6fa9251d43074e119ea0eacb721727cec953c0c 100755
--- a/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
+++ b/tensorflow/tools/ci_build/linux/cpu/run_py3_contrib.sh
@@ -34,35 +34,4 @@ yes "" | $PYTHON_BIN_PATH configure.py
 bazel test --test_tag_filters=-no_oss,-oss_serial,-gpu,-benchmark-test -k \
     --jobs=${N_JOBS} --test_timeout 300,450,1200,3600 --config=opt \
     --test_size_filters=small,medium --test_output=errors -- \
-    //tensorflow/contrib/... \
-    -//tensorflow/contrib/lite/... \
-    //tensorflow/contrib/lite:context_test \
-    //tensorflow/contrib/lite:framework \
-    //tensorflow/contrib/lite:interpreter_test \
-    //tensorflow/contrib/lite:model_test \
-    //tensorflow/contrib/lite/toco:toco \
-    //tensorflow/contrib/lite:simple_memory_arena_test \
-    //tensorflow/contrib/lite:string_util_test \
-    //tensorflow/contrib/lite/kernels:activations_test \
-    //tensorflow/contrib/lite/kernels:add_test \
-    //tensorflow/contrib/lite/kernels:basic_rnn_test \
-    //tensorflow/contrib/lite/kernels:concatenation_test \
-    //tensorflow/contrib/lite/kernels:conv_test \
-    //tensorflow/contrib/lite/kernels:depthwise_conv_test \
-    //tensorflow/contrib/lite/kernels:embedding_lookup_test \
-    //tensorflow/contrib/lite/kernels:embedding_lookup_sparse_test \
-    //tensorflow/contrib/lite/kernels:fully_connected_test \
-    //tensorflow/contrib/lite/testing:generated_zip_tests \
-    //tensorflow/contrib/lite/kernels:hashtable_lookup_test \
-    //tensorflow/contrib/lite/kernels:local_response_norm_test \
-    //tensorflow/contrib/lite/kernels:lsh_projection_test \
-    //tensorflow/contrib/lite/kernels:lstm_test \
-    //tensorflow/contrib/lite/kernels:l2norm_test \
-    //tensorflow/contrib/lite/kernels:mul_test \
-    //tensorflow/contrib/lite/kernels:pooling_test \
-    //tensorflow/contrib/lite/kernels:reshape_test \
-    //tensorflow/contrib/lite/kernels:resize_bilinear_test \
-    //tensorflow/contrib/lite/kernels:skip_gram_test \
-    //tensorflow/contrib/lite/kernels:softmax_test \
-    //tensorflow/contrib/lite/kernels:space_to_depth_test \
-    //tensorflow/contrib/lite/kernels:svdf_test
+    //tensorflow/contrib/...
diff --git a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh
index ad22ebe4eb304fe6b6f8613f43f2c7c001111503..a1d91a61237eb606337a7f95c1824662697ca69f 100755
--- a/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh
+++ b/tensorflow/tools/ci_build/linux/mkl/build-dev-container.sh
@@ -34,12 +34,17 @@ echo "TF_DOCKER_BUILD_DEVEL_BRANCH=${TF_DOCKER_BUILD_DEVEL_BRANCH}"
 echo "TF_DOCKER_BUILD_IMAGE_NAME=${TF_DOCKER_BUILD_IMAGE_NAME}"
 echo "TF_DOCKER_BUILD_VERSION=${TF_DOCKER_BUILD_VERSION}"
 
+# Build containers for AVX
+# Include the instructions for sandybridge and later, but tune for ivybridge
+TF_BAZEL_BUILD_OPTIONS="--config=mkl --copt=-march=sandybridge --copt=-mtune=ivybridge --copt=-O3 --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0"
+
 # build the python 2 container and whl
 TF_DOCKER_BUILD_TYPE="MKL" \
   TF_DOCKER_BUILD_IS_DEVEL="YES" \
   TF_DOCKER_BUILD_DEVEL_BRANCH="${TF_DOCKER_BUILD_DEVEL_BRANCH}" \
   TF_DOCKER_BUILD_IMAGE_NAME="${TF_DOCKER_BUILD_IMAGE_NAME}" \
   TF_DOCKER_BUILD_VERSION="${TF_DOCKER_BUILD_VERSION}" \
+  TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
   ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh 
 
 # build the python 3 container and whl
@@ -49,5 +54,29 @@ TF_DOCKER_BUILD_TYPE="MKL" \
   TF_DOCKER_BUILD_IMAGE_NAME="${TF_DOCKER_BUILD_IMAGE_NAME}" \
   TF_DOCKER_BUILD_VERSION="${TF_DOCKER_BUILD_VERSION}" \
   TF_DOCKER_BUILD_PYTHON_VERSION="PYTHON3" \
+  TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
+  ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh
+
+# Build containers for AVX2
+# Include the instructions for haswell and later, but tune for broadwell
+TF_BAZEL_BUILD_OPTIONS="--config=mkl --copt=-march=haswell --copt=-mtune=broadwell --copt=-O3 --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0"
+
+# build the python 2 container and whl
+TF_DOCKER_BUILD_TYPE="MKL" \
+  TF_DOCKER_BUILD_IS_DEVEL="YES" \
+  TF_DOCKER_BUILD_DEVEL_BRANCH="${TF_DOCKER_BUILD_DEVEL_BRANCH}" \
+  TF_DOCKER_BUILD_IMAGE_NAME="${TF_DOCKER_BUILD_IMAGE_NAME}" \
+  TF_DOCKER_BUILD_VERSION="${TF_DOCKER_BUILD_VERSION}-avx2" \
+  TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
   ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh 
 
+# build the python 3 container and whl
+TF_DOCKER_BUILD_TYPE="MKL" \
+  TF_DOCKER_BUILD_IS_DEVEL="YES" \
+  TF_DOCKER_BUILD_DEVEL_BRANCH="${TF_DOCKER_BUILD_DEVEL_BRANCH}" \
+  TF_DOCKER_BUILD_IMAGE_NAME="${TF_DOCKER_BUILD_IMAGE_NAME}" \
+  TF_DOCKER_BUILD_VERSION="${TF_DOCKER_BUILD_VERSION}-avx2" \
+  TF_DOCKER_BUILD_PYTHON_VERSION="PYTHON3" \
+  TF_BAZEL_BUILD_OPTIONS="${TF_BAZEL_BUILD_OPTIONS}" \
+  ${WORKSPACE}/tensorflow/tools/docker/parameterized_docker_build.sh
+
diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index c03cbd9c66a1d12185920290d4dc3cd52e4a616c..0482cf619a831ebb87e76cd18efbdac83a0d2f11 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -33,10 +33,10 @@ function set_remote_cache_options {
   echo "build --tls_enabled=true" >> "${TMP_BAZELRC}"
   echo "build --remote_timeout=3600" >> "${TMP_BAZELRC}"
   echo "build --auth_enabled=true" >> "${TMP_BAZELRC}"
-  echo "build --spawn_strategy=remote" >> "${TMP_BAZELRC}"
-  echo "build --strategy=Javac=remote" >> "${TMP_BAZELRC}"
-  echo "build --strategy=Closure=remote" >> "${TMP_BAZELRC}"
-  echo "build --genrule_strategy=remote" >> "${TMP_BAZELRC}"
+  echo "build --spawn_strategy=standalone" >> "${TMP_BAZELRC}"
+  echo "build --strategy=Javac=standalone" >> "${TMP_BAZELRC}"
+  echo "build --strategy=Closure=standalone" >> "${TMP_BAZELRC}"
+  echo "build --genrule_strategy=standalone" >> "${TMP_BAZELRC}"
   echo "build --google_credentials=$GOOGLE_CLOUD_CREDENTIAL" >> "${TMP_BAZELRC}"
 }
 
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 61dec249f3390875518b4fd3fc5b5a3bca53aae1..47e0e5dd59af76c733e7f2271294ad0e5c7e6b26 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -53,30 +53,39 @@ function cleanup {
 }
 trap cleanup EXIT
 
-skip_test=0
-release_build=0
+PY_TEST_DIR="py_test_dir"
 
+SKIP_TEST=0
+RELEASE_BUILD=0
+TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/... \
+    //${PY_TEST_DIR}/tensorflow/contrib/... "
+
+# --skip_test            Skip running tests
+# --enable_remote_cache  Add options to enable remote cache for build and test
+# --release_build        Build for release, compilation time will be longer to
+#                        ensure performance
+# --test_core_only       Use tensorflow/python/... as test target
+# --test_contrib_only    Use tensorflow/contrib/... as test target
 for ARG in "$@"; do
-  if [[ "$ARG" == --skip_test ]]; then
-    skip_test=1
-  elif [[ "$ARG" == --enable_remote_cache ]]; then
-    set_remote_cache_options
-  elif [[ "$ARG" == --release_build ]]; then
-    release_build=1
-  fi
+  case "$ARG" in
+    --skip_test) SKIP_TEST=1 ;;
+    --enable_remote_cache) set_remote_cache_options ;;
+    --release_build) RELEASE_BUILD=1 ;;
+    --test_core_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..." ;;
+    --test_contrib_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/contrib/..." ;;
+    *)
+  esac
 done
 
-if [[ "$release_build" != 1 ]]; then
-  # --define=override_eigen_strong_inline=true speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc
+if [[ "$RELEASE_BUILD" == 1 ]]; then
+  # Overriding eigen strong inline speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc
   # by 20 minutes. See https://github.com/tensorflow/tensorflow/issues/10521
-  # Because this hurts the performance of TF, we don't enable it in release build.
-  echo "build --define=override_eigen_strong_inline=true" >> "${TMP_BAZELRC}"
+  # Because this hurts the performance of TF, we don't override it in release build.
+  export TF_OVERRIDE_EIGEN_STRONG_INLINE=0
+else
+  export TF_OVERRIDE_EIGEN_STRONG_INLINE=1
 fi
 
-# The host and target platforms are the same in Windows build. So we don't have
-# to distinct them. This helps avoid building the same targets twice.
-echo "build --distinct_host_configuration=false" >> "${TMP_BAZELRC}"
-
 # Enable short object file path to avoid long path issue on Windows.
 echo "startup --output_user_root=${TMPDIR}" >> "${TMP_BAZELRC}"
 
@@ -88,12 +97,11 @@ run_configure_for_cpu_build
 
 bazel build --announce_rc --config=opt tensorflow/tools/pip_package:build_pip_package || exit $?
 
-if [[ "$skip_test" == 1 ]]; then
+if [[ "$SKIP_TEST" == 1 ]]; then
   exit 0
 fi
 
 # Create a python test directory to avoid package name conflict
-PY_TEST_DIR="py_test_dir"
 create_python_test_dir "${PY_TEST_DIR}"
 
 ./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}"
@@ -114,5 +122,4 @@ bazel test --announce_rc --config=opt -k --test_output=errors \
   --test_size_filters=small,medium \
   --jobs="${N_JOBS}" --test_timeout="300,450,1200,3600" \
   --flaky_test_attempts=3 \
-  //${PY_TEST_DIR}/tensorflow/python/... \
-  //${PY_TEST_DIR}/tensorflow/contrib/...
+  ${TEST_TARGET}
diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
index e232306653428dcaf5d15ec48570e7b99c574981..e3eee110808ce6cf1905a44b9108ad6de49f10cb 100644
--- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
@@ -53,30 +53,39 @@ function cleanup {
 }
 trap cleanup EXIT
 
-skip_test=0
-release_build=0
+PY_TEST_DIR="py_test_dir"
 
+SKIP_TEST=0
+RELEASE_BUILD=0
+TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/... \
+  //${PY_TEST_DIR}/tensorflow/contrib/... "
+
+# --skip_test            Skip running tests
+# --enable_remote_cache  Add options to enable remote cache for build and test
+# --release_build        Build for release, compilation time will be longer to
+#                        ensure performance
+# --test_core_only       Use tensorflow/python/... as test target
+# --test_contrib_only    Use tensorflow/contrib/... as test target
 for ARG in "$@"; do
-  if [[ "$ARG" == --skip_test ]]; then
-    skip_test=1
-  elif [[ "$ARG" == --enable_remote_cache ]]; then
-    set_remote_cache_options
-  elif [[ "$ARG" == --release_build ]]; then
-    release_build=1
-  fi
+  case "$ARG" in
+    --skip_test) SKIP_TEST=1 ;;
+    --enable_remote_cache) set_remote_cache_options ;;
+    --release_build) RELEASE_BUILD=1 ;;
+    --test_core_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..." ;;
+    --test_contrib_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/contrib/..." ;;
+    *)
+  esac
 done
 
-if [[ "$release_build" != 1 ]]; then
-  # --define=override_eigen_strong_inline=true speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc
+if [[ "$RELEASE_BUILD" == 1 ]]; then
+  # Overriding eigen strong inline speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc
   # by 20 minutes. See https://github.com/tensorflow/tensorflow/issues/10521
-  # Because this hurts the performance of TF, we don't enable it in release build.
-  echo "build --define=override_eigen_strong_inline=true" >> "${TMP_BAZELRC}"
+  # Because this hurts the performance of TF, we don't override it in release build.
+  export TF_OVERRIDE_EIGEN_STRONG_INLINE=0
+else
+  export TF_OVERRIDE_EIGEN_STRONG_INLINE=1
 fi
 
-# The host and target platforms are the same in Windows build. So we don't have
-# to distinct them. This helps avoid building the same targets twice.
-echo "build --distinct_host_configuration=false" >> "${TMP_BAZELRC}"
-
 # Enable short object file path to avoid long path issue on Windows.
 echo "startup --output_user_root=${TMPDIR}" >> "${TMP_BAZELRC}"
 
@@ -91,12 +100,11 @@ run_configure_for_gpu_build
 
 bazel build --announce_rc --config=opt tensorflow/tools/pip_package:build_pip_package || exit $?
 
-if [[ "$skip_test" == 1 ]]; then
+if [[ "$SKIP_TEST" == 1 ]]; then
   exit 0
 fi
 
 # Create a python test directory to avoid package name conflict
-PY_TEST_DIR="py_test_dir"
 create_python_test_dir "${PY_TEST_DIR}"
 
 ./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}"
@@ -119,5 +127,4 @@ bazel test --announce_rc --config=opt -k --test_output=errors \
   --test_size_filters=small,medium \
   --local_test_jobs=$TF_GPU_COUNT --test_timeout="300,450,1200,3600" \
   --flaky_test_attempts=3 \
-  //${PY_TEST_DIR}/tensorflow/python/... \
-  //${PY_TEST_DIR}/tensorflow/contrib/...
+  ${TEST_TARGET}
diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD
index b7bfb29aae4fcaa55e01ba924f72cf79d2b09ad1..55792c51fe87f0ded92730c13409169f6c67d035 100644
--- a/tensorflow/tools/compatibility/BUILD
+++ b/tensorflow/tools/compatibility/BUILD
@@ -8,10 +8,17 @@ load(
     "tf_cc_test",  # @unused
 )
 
+py_library(
+    name = "ast_edits",
+    srcs = ["ast_edits.py"],
+    srcs_version = "PY2AND3",
+)
+
 py_binary(
     name = "tf_upgrade",
     srcs = ["tf_upgrade.py"],
     srcs_version = "PY2AND3",
+    deps = [":ast_edits"],
 )
 
 py_test(
@@ -26,6 +33,28 @@ py_test(
     ],
 )
 
+py_binary(
+    name = "tf_upgrade_v2",
+    srcs = [
+        "renames_v2.py",
+        "tf_upgrade_v2.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [":ast_edits"],
+)
+
+py_test(
+    name = "tf_upgrade_v2_test",
+    srcs = ["tf_upgrade_v2_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":tf_upgrade_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "@six_archive//:six",
+    ],
+)
+
 # Keep for reference, this test will succeed in 0.11 but fail in 1.0
 # py_test(
 #     name = "test_file_v0_11",
@@ -62,9 +91,37 @@ py_test(
     ],
 )
 
+genrule(
+    name = "generate_upgraded_file_v2",
+    testonly = 1,
+    srcs = ["testdata/test_file_v1_10.py"],
+    outs = [
+        "test_file_v2_0.py",
+        "report_v2.txt",
+    ],
+    cmd = ("$(location :tf_upgrade_v2)" +
+           " --infile $(location testdata/test_file_v1_10.py)" +
+           " --outfile $(location test_file_v2_0.py)" +
+           " --reportfile $(location report_v2.txt)"),
+    tools = [":tf_upgrade_v2"],
+)
+
+py_test(
+    name = "test_file_v2_0",
+    size = "small",
+    srcs = ["test_file_v2_0.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
 exports_files(
     [
+        "ast_edits.py",
         "tf_upgrade.py",
+        "renames_v2.py",
         "testdata/test_file_v0_11.py",
+        "testdata/test_file_v1_10.py",
     ],
 )
diff --git a/tensorflow/tools/compatibility/renames_v2.py b/tensorflow/tools/compatibility/renames_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..216aa41b60eb566db37244b72cbeef024546607f
--- /dev/null
+++ b/tensorflow/tools/compatibility/renames_v2.py
@@ -0,0 +1,134 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+"""List of renames to apply when converting from TF 1.0 to TF 2.0.
+
+THIS FILE IS AUTOGENERATED: To update, please run:
+  bazel build tensorflow/tools/compatibility/update:generate_v2_renames_map
+  bazel-bin/tensorflow/tools/compatibility/update/generate_v2_renames_map
+This file should be updated whenever endpoints are deprecated.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+renames = {
+    'tf.acos': 'tf.math.acos',
+    'tf.acosh': 'tf.math.acosh',
+    'tf.add': 'tf.math.add',
+    'tf.as_string': 'tf.dtypes.as_string',
+    'tf.asin': 'tf.math.asin',
+    'tf.asinh': 'tf.math.asinh',
+    'tf.atan': 'tf.math.atan',
+    'tf.atan2': 'tf.math.atan2',
+    'tf.atanh': 'tf.math.atanh',
+    'tf.batch_to_space_nd': 'tf.manip.batch_to_space_nd',
+    'tf.betainc': 'tf.math.betainc',
+    'tf.ceil': 'tf.math.ceil',
+    'tf.check_numerics': 'tf.debugging.check_numerics',
+    'tf.cholesky': 'tf.linalg.cholesky',
+    'tf.cos': 'tf.math.cos',
+    'tf.cosh': 'tf.math.cosh',
+    'tf.cross': 'tf.linalg.cross',
+    'tf.decode_base64': 'tf.io.decode_base64',
+    'tf.decode_compressed': 'tf.io.decode_compressed',
+    'tf.decode_json_example': 'tf.io.decode_json_example',
+    'tf.decode_raw': 'tf.io.decode_raw',
+    'tf.dequantize': 'tf.quantization.dequantize',
+    'tf.diag': 'tf.linalg.tensor_diag',
+    'tf.diag_part': 'tf.linalg.tensor_diag_part',
+    'tf.digamma': 'tf.math.digamma',
+    'tf.encode_base64': 'tf.io.encode_base64',
+    'tf.equal': 'tf.math.equal',
+    'tf.erfc': 'tf.math.erfc',
+    'tf.exp': 'tf.math.exp',
+    'tf.expm1': 'tf.math.expm1',
+    'tf.extract_image_patches': 'tf.image.extract_image_patches',
+    'tf.fake_quant_with_min_max_args': 'tf.quantization.fake_quant_with_min_max_args',
+    'tf.fake_quant_with_min_max_args_gradient': 'tf.quantization.fake_quant_with_min_max_args_gradient',
+    'tf.fake_quant_with_min_max_vars': 'tf.quantization.fake_quant_with_min_max_vars',
+    'tf.fake_quant_with_min_max_vars_gradient': 'tf.quantization.fake_quant_with_min_max_vars_gradient',
+    'tf.fake_quant_with_min_max_vars_per_channel': 'tf.quantization.fake_quant_with_min_max_vars_per_channel',
+    'tf.fake_quant_with_min_max_vars_per_channel_gradient': 'tf.quantization.fake_quant_with_min_max_vars_per_channel_gradient',
+    'tf.fft': 'tf.spectral.fft',
+    'tf.floor': 'tf.math.floor',
+    'tf.gather_nd': 'tf.manip.gather_nd',
+    'tf.greater': 'tf.math.greater',
+    'tf.greater_equal': 'tf.math.greater_equal',
+    'tf.ifft': 'tf.spectral.ifft',
+    'tf.igamma': 'tf.math.igamma',
+    'tf.igammac': 'tf.math.igammac',
+    'tf.invert_permutation': 'tf.math.invert_permutation',
+    'tf.is_finite': 'tf.debugging.is_finite',
+    'tf.is_inf': 'tf.debugging.is_inf',
+    'tf.is_nan': 'tf.debugging.is_nan',
+    'tf.less': 'tf.math.less',
+    'tf.less_equal': 'tf.math.less_equal',
+    'tf.lgamma': 'tf.math.lgamma',
+    'tf.log': 'tf.math.log',
+    'tf.log1p': 'tf.math.log1p',
+    'tf.logical_and': 'tf.math.logical_and',
+    'tf.logical_not': 'tf.math.logical_not',
+    'tf.logical_or': 'tf.math.logical_or',
+    'tf.matching_files': 'tf.io.matching_files',
+    'tf.matrix_band_part': 'tf.linalg.band_part',
+    'tf.matrix_determinant': 'tf.linalg.det',
+    'tf.matrix_diag': 'tf.linalg.diag',
+    'tf.matrix_diag_part': 'tf.linalg.diag_part',
+    'tf.matrix_inverse': 'tf.linalg.inv',
+    'tf.matrix_set_diag': 'tf.linalg.set_diag',
+    'tf.matrix_solve': 'tf.linalg.solve',
+    'tf.matrix_triangular_solve': 'tf.linalg.triangular_solve',
+    'tf.maximum': 'tf.math.maximum',
+    'tf.minimum': 'tf.math.minimum',
+    'tf.not_equal': 'tf.math.not_equal',
+    'tf.parse_tensor': 'tf.io.parse_tensor',
+    'tf.polygamma': 'tf.math.polygamma',
+    'tf.qr': 'tf.linalg.qr',
+    'tf.quantized_concat': 'tf.quantization.quantized_concat',
+    'tf.read_file': 'tf.io.read_file',
+    'tf.reciprocal': 'tf.math.reciprocal',
+    'tf.regex_replace': 'tf.strings.regex_replace',
+    'tf.reshape': 'tf.manip.reshape',
+    'tf.reverse': 'tf.manip.reverse',
+    'tf.reverse_v2': 'tf.manip.reverse',
+    'tf.rint': 'tf.math.rint',
+    'tf.rsqrt': 'tf.math.rsqrt',
+    'tf.scatter_nd': 'tf.manip.scatter_nd',
+    'tf.segment_max': 'tf.math.segment_max',
+    'tf.segment_mean': 'tf.math.segment_mean',
+    'tf.segment_min': 'tf.math.segment_min',
+    'tf.segment_prod': 'tf.math.segment_prod',
+    'tf.segment_sum': 'tf.math.segment_sum',
+    'tf.sin': 'tf.math.sin',
+    'tf.sinh': 'tf.math.sinh',
+    'tf.space_to_batch_nd': 'tf.manip.space_to_batch_nd',
+    'tf.squared_difference': 'tf.math.squared_difference',
+    'tf.string_join': 'tf.strings.join',
+    'tf.string_strip': 'tf.strings.strip',
+    'tf.string_to_hash_bucket': 'tf.strings.to_hash_bucket',
+    'tf.string_to_hash_bucket_fast': 'tf.strings.to_hash_bucket_fast',
+    'tf.string_to_hash_bucket_strong': 'tf.strings.to_hash_bucket_strong',
+    'tf.string_to_number': 'tf.strings.to_number',
+    'tf.substr': 'tf.strings.substr',
+    'tf.tan': 'tf.math.tan',
+    'tf.tile': 'tf.manip.tile',
+    'tf.unsorted_segment_max': 'tf.math.unsorted_segment_max',
+    'tf.unsorted_segment_min': 'tf.math.unsorted_segment_min',
+    'tf.unsorted_segment_prod': 'tf.math.unsorted_segment_prod',
+    'tf.unsorted_segment_sum': 'tf.math.unsorted_segment_sum',
+    'tf.write_file': 'tf.io.write_file',
+    'tf.zeta': 'tf.math.zeta'
+}
diff --git a/tensorflow/tools/compatibility/testdata/test_file_v1_10.py b/tensorflow/tools/compatibility/testdata/test_file_v1_10.py
new file mode 100644
index 0000000000000000000000000000000000000000..a49035a1a09bb6b6ea33a375766c9c414f871df1
--- /dev/null
+++ b/tensorflow/tools/compatibility/testdata/test_file_v1_10.py
@@ -0,0 +1,34 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf upgrader."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tensorflow as tf
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test as test_lib
+
+
+class TestUpgrade(test_util.TensorFlowTestCase):
+  """Test various APIs that have been changed in 2.0."""
+
+  def testRenames(self):
+    with self.test_session():
+      self.assertAllClose(1.04719755, tf.acos(0.5).eval())
+      self.assertAllClose(0.5, tf.rsqrt(4.0).eval())
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py
index 1f8833582af4c922115e637117e775e619439786..96705b1a4c27e72ba1d50f16dad10c35705b1782 100644
--- a/tensorflow/tools/compatibility/tf_upgrade.py
+++ b/tensorflow/tools/compatibility/tf_upgrade.py
@@ -19,491 +19,11 @@ from __future__ import division
 from __future__ import print_function
 
 import argparse
-import ast
-import collections
-import os
-import shutil
-import sys
-import tempfile
-import traceback
 
+from tensorflow.tools.compatibility import ast_edits
 
-class APIChangeSpec(object):
-  """This class defines the transformations that need to happen.
 
-  This class must provide the following fields:
-
-  * `function_keyword_renames`: maps function names to a map of old -> new
-    argument names
-  * `function_renames`: maps function names to new function names
-  * `change_to_function`: a set of function names that have changed (for
-    notifications)
-  * `function_reorders`: maps functions whose argument order has changed to the
-    list of arguments in the new order
-  * `function_handle`: maps function names to custom handlers for the function
-
-  For an example, see `TFAPIChangeSpec`.
-  """
-
-
-class _FileEditTuple(
-    collections.namedtuple("_FileEditTuple",
-                           ["comment", "line", "start", "old", "new"])):
-  """Each edit that is recorded by a _FileEditRecorder.
-
-  Fields:
-    comment: A description of the edit and why it was made.
-    line: The line number in the file where the edit occurs (1-indexed).
-    start: The line number in the file where the edit occurs (0-indexed).
-    old: text string to remove (this must match what was in file).
-    new: text string to add in place of `old`.
-  """
-
-  __slots__ = ()
-
-
-class _FileEditRecorder(object):
-  """Record changes that need to be done to the file."""
-
-  def __init__(self, filename):
-    # all edits are lists of chars
-    self._filename = filename
-
-    self._line_to_edit = collections.defaultdict(list)
-    self._errors = []
-
-  def process(self, text):
-    """Process a list of strings, each corresponding to the recorded changes.
-
-    Args:
-      text: A list of lines of text (assumed to contain newlines)
-    Returns:
-      A tuple of the modified text and a textual description of what is done.
-    Raises:
-      ValueError: if substitution source location does not have expected text.
-    """
-
-    change_report = ""
-
-    # Iterate of each line
-    for line, edits in self._line_to_edit.items():
-      offset = 0
-      # sort by column so that edits are processed in order in order to make
-      # indexing adjustments cumulative for changes that change the string
-      # length
-      edits.sort(key=lambda x: x.start)
-
-      # Extract each line to a list of characters, because mutable lists
-      # are editable, unlike immutable strings.
-      char_array = list(text[line - 1])
-
-      # Record a description of the change
-      change_report += "%r Line %d\n" % (self._filename, line)
-      change_report += "-" * 80 + "\n\n"
-      for e in edits:
-        change_report += "%s\n" % e.comment
-      change_report += "\n    Old: %s" % (text[line - 1])
-
-      # Make underscore buffers for underlining where in the line the edit was
-      change_list = [" "] * len(text[line - 1])
-      change_list_new = [" "] * len(text[line - 1])
-
-      # Iterate for each edit
-      for e in edits:
-        # Create effective start, end by accounting for change in length due
-        # to previous edits
-        start_eff = e.start + offset
-        end_eff = start_eff + len(e.old)
-
-        # Make sure the edit is changing what it should be changing
-        old_actual = "".join(char_array[start_eff:end_eff])
-        if old_actual != e.old:
-          raise ValueError("Expected text %r but got %r" %
-                           ("".join(e.old), "".join(old_actual)))
-        # Make the edit
-        char_array[start_eff:end_eff] = list(e.new)
-
-        # Create the underline highlighting of the before and after
-        change_list[e.start:e.start + len(e.old)] = "~" * len(e.old)
-        change_list_new[start_eff:end_eff] = "~" * len(e.new)
-
-        # Keep track of how to generate effective ranges
-        offset += len(e.new) - len(e.old)
-
-      # Finish the report comment
-      change_report += "         %s\n" % "".join(change_list)
-      text[line - 1] = "".join(char_array)
-      change_report += "    New: %s" % (text[line - 1])
-      change_report += "         %s\n\n" % "".join(change_list_new)
-    return "".join(text), change_report, self._errors
-
-  def add(self, comment, line, start, old, new, error=None):
-    """Add a new change that is needed.
-
-    Args:
-      comment: A description of what was changed
-      line: Line number (1 indexed)
-      start: Column offset (0 indexed)
-      old: old text
-      new: new text
-      error: this "edit" is something that cannot be fixed automatically
-    Returns:
-      None
-    """
-
-    self._line_to_edit[line].append(
-        _FileEditTuple(comment, line, start, old, new))
-    if error:
-      self._errors.append("%s:%d: %s" % (self._filename, line, error))
-
-
-class _ASTCallVisitor(ast.NodeVisitor):
-  """AST Visitor that processes function calls.
-
-  Updates function calls from old API version to new API version using a given
-  change spec.
-  """
-
-  def __init__(self, filename, lines, api_change_spec):
-    self._filename = filename
-    self._file_edit = _FileEditRecorder(filename)
-    self._lines = lines
-    self._api_change_spec = api_change_spec
-
-  def process(self, lines):
-    return self._file_edit.process(lines)
-
-  def generic_visit(self, node):
-    ast.NodeVisitor.generic_visit(self, node)
-
-  def _rename_functions(self, node, full_name):
-    function_renames = self._api_change_spec.function_renames
-    try:
-      new_name = function_renames[full_name]
-      self._file_edit.add("Renamed function %r to %r" % (full_name, new_name),
-                          node.lineno, node.col_offset, full_name, new_name)
-    except KeyError:
-      pass
-
-  def _get_attribute_full_path(self, node):
-    """Traverse an attribute to generate a full name e.g. tf.foo.bar.
-
-    Args:
-      node: A Node of type Attribute.
-
-    Returns:
-      a '.'-delimited full-name or None if the tree was not a simple form.
-      i.e. `foo()+b).bar` returns None, while `a.b.c` would return "a.b.c".
-    """
-    curr = node
-    items = []
-    while not isinstance(curr, ast.Name):
-      if not isinstance(curr, ast.Attribute):
-        return None
-      items.append(curr.attr)
-      curr = curr.value
-    items.append(curr.id)
-    return ".".join(reversed(items))
-
-  def _find_true_position(self, node):
-    """Return correct line number and column offset for a given node.
-
-    This is necessary mainly because ListComp's location reporting reports
-    the next token after the list comprehension list opening.
-
-    Args:
-      node: Node for which we wish to know the lineno and col_offset
-    """
-    import re
-    find_open = re.compile("^\s*(\\[).*$")
-    find_string_chars = re.compile("['\"]")
-
-    if isinstance(node, ast.ListComp):
-      # Strangely, ast.ListComp returns the col_offset of the first token
-      # after the '[' token which appears to be a bug. Workaround by
-      # explicitly finding the real start of the list comprehension.
-      line = node.lineno
-      col = node.col_offset
-      # loop over lines
-      while 1:
-        # Reverse the text to and regular expression search for whitespace
-        text = self._lines[line - 1]
-        reversed_preceding_text = text[:col][::-1]
-        # First find if a [ can be found with only whitespace between it and
-        # col.
-        m = find_open.match(reversed_preceding_text)
-        if m:
-          new_col_offset = col - m.start(1) - 1
-          return line, new_col_offset
-        else:
-          if (reversed_preceding_text == "" or
-              reversed_preceding_text.isspace()):
-            line = line - 1
-            prev_line = self._lines[line - 1]
-            # TODO(aselle):
-            # this is poor comment detection, but it is good enough for
-            # cases where the comment does not contain string literal starting/
-            # ending characters. If ast gave us start and end locations of the
-            # ast nodes rather than just start, we could use string literal
-            # node ranges to filter out spurious #'s that appear in string
-            # literals.
-            comment_start = prev_line.find("#")
-            if comment_start == -1:
-              col = len(prev_line) - 1
-            elif find_string_chars.search(prev_line[comment_start:]) is None:
-              col = comment_start
-            else:
-              return None, None
-          else:
-            return None, None
-    # Most other nodes return proper locations (with notably does not), but
-    # it is not possible to use that in an argument.
-    return node.lineno, node.col_offset
-
-  def visit_Call(self, node):  # pylint: disable=invalid-name
-    """Handle visiting a call node in the AST.
-
-    Args:
-      node: Current Node
-    """
-
-    # Find a simple attribute name path e.g. "tf.foo.bar"
-    full_name = self._get_attribute_full_path(node.func)
-
-    # Make sure the func is marked as being part of a call
-    node.func.is_function_for_call = True
-
-    if full_name:
-      # Call special handlers
-      function_handles = self._api_change_spec.function_handle
-      if full_name in function_handles:
-        function_handles[full_name](self._file_edit, node)
-
-      # Examine any non-keyword argument and make it into a keyword argument
-      # if reordering required.
-      function_reorders = self._api_change_spec.function_reorders
-      function_keyword_renames = (
-          self._api_change_spec.function_keyword_renames)
-
-      if full_name in function_reorders:
-        reordered = function_reorders[full_name]
-        for idx, arg in enumerate(node.args):
-          lineno, col_offset = self._find_true_position(arg)
-          if lineno is None or col_offset is None:
-            self._file_edit.add(
-                "Failed to add keyword %r to reordered function %r" %
-                (reordered[idx], full_name),
-                arg.lineno,
-                arg.col_offset,
-                "",
-                "",
-                error="A necessary keyword argument failed to be inserted.")
-          else:
-            keyword_arg = reordered[idx]
-            if (full_name in function_keyword_renames and
-                keyword_arg in function_keyword_renames[full_name]):
-              keyword_arg = function_keyword_renames[full_name][keyword_arg]
-            self._file_edit.add("Added keyword %r to reordered function %r" %
-                                (reordered[idx], full_name), lineno, col_offset,
-                                "", keyword_arg + "=")
-
-      # Examine each keyword argument and convert it to the final renamed form
-      renamed_keywords = ({} if full_name not in function_keyword_renames else
-                          function_keyword_renames[full_name])
-      for keyword in node.keywords:
-        argkey = keyword.arg
-        argval = keyword.value
-
-        if argkey in renamed_keywords:
-          argval_lineno, argval_col_offset = self._find_true_position(argval)
-          if argval_lineno is not None and argval_col_offset is not None:
-            # TODO(aselle): We should scan backward to find the start of the
-            # keyword key. Unfortunately ast does not give you the location of
-            # keyword keys, so we are forced to infer it from the keyword arg
-            # value.
-            key_start = argval_col_offset - len(argkey) - 1
-            key_end = key_start + len(argkey) + 1
-            if (self._lines[argval_lineno - 1][key_start:key_end] == argkey +
-                "="):
-              self._file_edit.add("Renamed keyword argument from %r to %r" %
-                                  (argkey,
-                                   renamed_keywords[argkey]), argval_lineno,
-                                  argval_col_offset - len(argkey) - 1,
-                                  argkey + "=", renamed_keywords[argkey] + "=")
-              continue
-          self._file_edit.add(
-              "Failed to rename keyword argument from %r to %r" %
-              (argkey, renamed_keywords[argkey]),
-              argval.lineno,
-              argval.col_offset - len(argkey) - 1,
-              "",
-              "",
-              error="Failed to find keyword lexographically. Fix manually.")
-
-    ast.NodeVisitor.generic_visit(self, node)
-
-  def visit_Attribute(self, node):  # pylint: disable=invalid-name
-    """Handle bare Attributes i.e. [tf.foo, tf.bar].
-
-    Args:
-      node: Node that is of type ast.Attribute
-    """
-    full_name = self._get_attribute_full_path(node)
-    if full_name:
-      self._rename_functions(node, full_name)
-    if full_name in self._api_change_spec.change_to_function:
-      if not hasattr(node, "is_function_for_call"):
-        new_text = full_name + "()"
-        self._file_edit.add("Changed %r to %r" % (full_name, new_text),
-                            node.lineno, node.col_offset, full_name, new_text)
-
-    ast.NodeVisitor.generic_visit(self, node)
-
-
-class ASTCodeUpgrader(object):
-  """Handles upgrading a set of Python files using a given API change spec."""
-
-  def __init__(self, api_change_spec):
-    if not isinstance(api_change_spec, APIChangeSpec):
-      raise TypeError("Must pass APIChangeSpec to ASTCodeUpgrader, got %s" %
-                      type(api_change_spec))
-    self._api_change_spec = api_change_spec
-
-  def process_file(self, in_filename, out_filename):
-    """Process the given python file for incompatible changes.
-
-    Args:
-      in_filename: filename to parse
-      out_filename: output file to write to
-    Returns:
-      A tuple representing number of files processed, log of actions, errors
-    """
-
-    # Write to a temporary file, just in case we are doing an implace modify.
-    with open(in_filename, "r") as in_file, \
-        tempfile.NamedTemporaryFile("w", delete=False) as temp_file:
-      ret = self.process_opened_file(in_filename, in_file, out_filename,
-                                     temp_file)
-
-    shutil.move(temp_file.name, out_filename)
-    return ret
-
-  # Broad exceptions are required here because ast throws whatever it wants.
-  # pylint: disable=broad-except
-  def process_opened_file(self, in_filename, in_file, out_filename, out_file):
-    """Process the given python file for incompatible changes.
-
-    This function is split out to facilitate StringIO testing from
-    tf_upgrade_test.py.
-
-    Args:
-      in_filename: filename to parse
-      in_file: opened file (or StringIO)
-      out_filename: output file to write to
-      out_file: opened file (or StringIO)
-    Returns:
-      A tuple representing number of files processed, log of actions, errors
-    """
-    process_errors = []
-    text = "-" * 80 + "\n"
-    text += "Processing file %r\n outputting to %r\n" % (in_filename,
-                                                         out_filename)
-    text += "-" * 80 + "\n\n"
-
-    parsed_ast = None
-    lines = in_file.readlines()
-    try:
-      parsed_ast = ast.parse("".join(lines))
-    except Exception:
-      text += "Failed to parse %r\n\n" % in_filename
-      text += traceback.format_exc()
-    if parsed_ast:
-      visitor = _ASTCallVisitor(in_filename, lines, self._api_change_spec)
-      visitor.visit(parsed_ast)
-      out_text, new_text, process_errors = visitor.process(lines)
-      text += new_text
-      if out_file:
-        out_file.write(out_text)
-    text += "\n"
-    return 1, text, process_errors
-
-  # pylint: enable=broad-except
-
-  def process_tree(self, root_directory, output_root_directory,
-                   copy_other_files):
-    """Processes upgrades on an entire tree of python files in place.
-
-    Note that only Python files. If you have custom code in other languages,
-    you will need to manually upgrade those.
-
-    Args:
-      root_directory: Directory to walk and process.
-      output_root_directory: Directory to use as base.
-      copy_other_files: Copy files that are not touched by this converter.
-
-    Returns:
-      A tuple of files processed, the report string ofr all files, and errors
-    """
-
-    # make sure output directory doesn't exist
-    if output_root_directory and os.path.exists(output_root_directory):
-      print("Output directory %r must not already exist." %
-            (output_root_directory))
-      sys.exit(1)
-
-    # make sure output directory does not overlap with root_directory
-    norm_root = os.path.split(os.path.normpath(root_directory))
-    norm_output = os.path.split(os.path.normpath(output_root_directory))
-    if norm_root == norm_output:
-      print("Output directory %r same as input directory %r" %
-            (root_directory, output_root_directory))
-      sys.exit(1)
-
-    # Collect list of files to process (we do this to correctly handle if the
-    # user puts the output directory in some sub directory of the input dir)
-    files_to_process = []
-    files_to_copy = []
-    for dir_name, _, file_list in os.walk(root_directory):
-      py_files = [f for f in file_list if f.endswith(".py")]
-      copy_files = [f for f in file_list if not f.endswith(".py")]
-      for filename in py_files:
-        fullpath = os.path.join(dir_name, filename)
-        fullpath_output = os.path.join(output_root_directory,
-                                       os.path.relpath(fullpath,
-                                                       root_directory))
-        files_to_process.append((fullpath, fullpath_output))
-      if copy_other_files:
-        for filename in copy_files:
-          fullpath = os.path.join(dir_name, filename)
-          fullpath_output = os.path.join(output_root_directory,
-                                         os.path.relpath(
-                                             fullpath, root_directory))
-          files_to_copy.append((fullpath, fullpath_output))
-
-    file_count = 0
-    tree_errors = []
-    report = ""
-    report += ("=" * 80) + "\n"
-    report += "Input tree: %r\n" % root_directory
-    report += ("=" * 80) + "\n"
-
-    for input_path, output_path in files_to_process:
-      output_directory = os.path.dirname(output_path)
-      if not os.path.isdir(output_directory):
-        os.makedirs(output_directory)
-      file_count += 1
-      _, l_report, l_errors = self.process_file(input_path, output_path)
-      tree_errors += l_errors
-      report += l_report
-    for input_path, output_path in files_to_copy:
-      output_directory = os.path.dirname(output_path)
-      if not os.path.isdir(output_directory):
-        os.makedirs(output_directory)
-      shutil.copy(input_path, output_path)
-    return file_count, report, tree_errors
-
-
-class TFAPIChangeSpec(APIChangeSpec):
+class TFAPIChangeSpec(ast_edits.APIChangeSpec):
   """List of maps that describe what changed in the API."""
 
   def __init__(self):
@@ -718,7 +238,7 @@ Simple usage:
       default="report.txt")
   args = parser.parse_args()
 
-  upgrade = ASTCodeUpgrader(TFAPIChangeSpec())
+  upgrade = ast_edits.ASTCodeUpgrader(TFAPIChangeSpec())
   report_text = None
   report_filename = args.report_filename
   files_processed = 0
diff --git a/tensorflow/tools/compatibility/tf_upgrade_test.py b/tensorflow/tools/compatibility/tf_upgrade_test.py
index 3d02eacba6e7a91e6d3c88e8297306de9782f4bf..66325ea2ad36265c6c3779b414774abab8213a84 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_test.py
@@ -22,6 +22,7 @@ import tempfile
 import six
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test as test_lib
+from tensorflow.tools.compatibility import ast_edits
 from tensorflow.tools.compatibility import tf_upgrade
 
 
@@ -36,7 +37,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
   def _upgrade(self, old_file_text):
     in_file = six.StringIO(old_file_text)
     out_file = six.StringIO()
-    upgrader = tf_upgrade.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
+    upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
     count, report, errors = (
         upgrader.process_opened_file("test.py", in_file,
                                      "test_out.py", out_file))
@@ -139,7 +140,7 @@ class TestUpgradeFiles(test_util.TensorFlowTestCase):
     upgraded = "tf.multiply(a, b)\n"
     temp_file.write(original)
     temp_file.close()
-    upgrader = tf_upgrade.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
+    upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade.TFAPIChangeSpec())
     upgrader.process_file(temp_file.name, temp_file.name)
     self.assertAllEqual(open(temp_file.name).read(), upgraded)
     os.unlink(temp_file.name)
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..9702430a1219c33e6d68875e1366ee7ebb2ce308
--- /dev/null
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py
@@ -0,0 +1,115 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Upgrader for Python scripts from 1.* TensorFlow to 2.0 TensorFlow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+
+from tensorflow.tools.compatibility import ast_edits
+from tensorflow.tools.compatibility import renames_v2
+
+
+class TFAPIChangeSpec(ast_edits.APIChangeSpec):
+  """List of maps that describe what changed in the API."""
+
+  def __init__(self):
+    # Maps from a function name to a dictionary that describes how to
+    # map from an old argument keyword to the new argument keyword.
+    self.function_keyword_renames = {}
+
+    # Mapping from function to the new name of the function
+    self.function_renames = renames_v2.renames
+
+    # Variables that should be changed to functions.
+    self.change_to_function = {}
+
+    # Functions that were reordered should be changed to the new keyword args
+    # for safety, if positional arguments are used. If you have reversed the
+    # positional arguments yourself, this could do the wrong thing.
+    self.function_reorders = {}
+
+    # Specially handled functions.
+    self.function_handle = {}
+
+
+if __name__ == "__main__":
+  parser = argparse.ArgumentParser(
+      formatter_class=argparse.RawDescriptionHelpFormatter,
+      description="""Convert a TensorFlow Python file to 2.0
+
+Simple usage:
+  tf_convert_v2.py --infile foo.py --outfile bar.py
+  tf_convert_v2.py --intree ~/code/old --outtree ~/code/new
+""")
+  parser.add_argument(
+      "--infile",
+      dest="input_file",
+      help="If converting a single file, the name of the file "
+      "to convert")
+  parser.add_argument(
+      "--outfile",
+      dest="output_file",
+      help="If converting a single file, the output filename.")
+  parser.add_argument(
+      "--intree",
+      dest="input_tree",
+      help="If converting a whole tree of files, the directory "
+      "to read from (relative or absolute).")
+  parser.add_argument(
+      "--outtree",
+      dest="output_tree",
+      help="If converting a whole tree of files, the output "
+      "directory (relative or absolute).")
+  parser.add_argument(
+      "--copyotherfiles",
+      dest="copy_other_files",
+      help=("If converting a whole tree of files, whether to "
+            "copy the other files."),
+      type=bool,
+      default=False)
+  parser.add_argument(
+      "--reportfile",
+      dest="report_filename",
+      help=("The name of the file where the report log is "
+            "stored."
+            "(default: %(default)s)"),
+      default="report.txt")
+  args = parser.parse_args()
+
+  upgrade = ast_edits.ASTCodeUpgrader(TFAPIChangeSpec())
+  report_text = None
+  report_filename = args.report_filename
+  files_processed = 0
+  if args.input_file:
+    files_processed, report_text, errors = upgrade.process_file(
+        args.input_file, args.output_file)
+    files_processed = 1
+  elif args.input_tree:
+    files_processed, report_text, errors = upgrade.process_tree(
+        args.input_tree, args.output_tree, args.copy_other_files)
+  else:
+    parser.print_help()
+  if report_text:
+    open(report_filename, "w").write(report_text)
+    print("TensorFlow 2.0 Upgrade Script")
+    print("-----------------------------")
+    print("Converted %d files\n" % files_processed)
+    print("Detected %d errors that require attention" % len(errors))
+    print("-" * 80)
+    print("\n".join(errors))
+    print("\nMake sure to read the detailed log %r\n" % report_filename)
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..57ac04de0667b83b66853b7cee7b4a34bc9f2f2f
--- /dev/null
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
@@ -0,0 +1,83 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf 2.0 upgrader."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import tempfile
+import six
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test as test_lib
+from tensorflow.tools.compatibility import ast_edits
+from tensorflow.tools.compatibility import tf_upgrade_v2
+
+
+class TestUpgrade(test_util.TensorFlowTestCase):
+  """Test various APIs that have been changed in 2.0.
+
+  We also test whether a converted file is executable. test_file_v1_10.py
+  aims to exhaustively test that API changes are convertible and actually
+  work when run with current TensorFlow.
+  """
+
+  def _upgrade(self, old_file_text):
+    in_file = six.StringIO(old_file_text)
+    out_file = six.StringIO()
+    upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade_v2.TFAPIChangeSpec())
+    count, report, errors = (
+        upgrader.process_opened_file("test.py", in_file,
+                                     "test_out.py", out_file))
+    return count, report, errors, out_file.getvalue()
+
+  def testParseError(self):
+    _, report, unused_errors, unused_new_text = self._upgrade(
+        "import tensorflow as tf\na + \n")
+    self.assertTrue(report.find("Failed to parse") != -1)
+
+  def testReport(self):
+    text = "tf.acos(a)\n"
+    _, report, unused_errors, unused_new_text = self._upgrade(text)
+    # This is not a complete test, but it is a sanity test that a report
+    # is generating information.
+    self.assertTrue(report.find("Renamed function `tf.acos` to `tf.math.acos`"))
+
+  def testRename(self):
+    text = "tf.acos(a)\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, "tf.math.acos(a)\n")
+    text = "tf.rsqrt(tf.log(3.8))\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, "tf.math.rsqrt(tf.math.log(3.8))\n")
+
+
+class TestUpgradeFiles(test_util.TensorFlowTestCase):
+
+  def testInplace(self):
+    """Check to make sure we don't have a file system race."""
+    temp_file = tempfile.NamedTemporaryFile("w", delete=False)
+    original = "tf.acos(a, b)\n"
+    upgraded = "tf.math.acos(a, b)\n"
+    temp_file.write(original)
+    temp_file.close()
+    upgrader = ast_edits.ASTCodeUpgrader(tf_upgrade_v2.TFAPIChangeSpec())
+    upgrader.process_file(temp_file.name, temp_file.name)
+    self.assertAllEqual(open(temp_file.name).read(), upgraded)
+    os.unlink(temp_file.name)
+
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/tools/compatibility/update/BUILD b/tensorflow/tools/compatibility/update/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..feb37c902ec3359e6221937f4334ab2504394fa3
--- /dev/null
+++ b/tensorflow/tools/compatibility/update/BUILD
@@ -0,0 +1,15 @@
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//visibility:private"])
+
+py_binary(
+    name = "generate_v2_renames_map",
+    srcs = ["generate_v2_renames_map.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:lib",
+        "//tensorflow/tools/common:public_api",
+        "//tensorflow/tools/common:traverse",
+    ],
+)
diff --git a/tensorflow/tools/compatibility/update/generate_v2_renames_map.py b/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
new file mode 100644
index 0000000000000000000000000000000000000000..567eceb0b6595ceac624fe8211f22885a6490d85
--- /dev/null
+++ b/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
@@ -0,0 +1,103 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+"""Script for updating tensorflow/tools/compatibility/renames_v2.py.
+
+To update renames_v2.py, run:
+  bazel build tensorflow/tools/compatibility/update:generate_v2_renames_map
+  bazel-bin/tensorflow/tools/compatibility/update/generate_v2_renames_map
+"""
+# pylint: enable=line-too-long
+
+import tensorflow as tf
+
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_export
+from tensorflow.tools.common import public_api
+from tensorflow.tools.common import traverse
+
+
+_OUTPUT_FILE_PATH = 'third_party/tensorflow/tools/compatibility/renames_v2.py'
+_FILE_HEADER = """# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+\"\"\"List of renames to apply when converting from TF 1.0 to TF 2.0.
+
+THIS FILE IS AUTOGENERATED: To update, please run:
+  bazel build tensorflow/tools/compatibility/update:generate_v2_renames_map
+  bazel-bin/tensorflow/tools/compatibility/update/generate_v2_renames_map
+This file should be updated whenever endpoints are deprecated.
+\"\"\"
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+"""
+
+
+def update_renames_v2(output_file_path):
+  """Writes a Python dictionary mapping deprecated to canonical API names.
+
+  Args:
+    output_file_path: File path to write output to. Any existing contents
+      would be replaced.
+  """
+  # Set of rename lines to write to output file in the form:
+  #   'tf.deprecated_name': 'tf.canonical_name'
+  rename_line_set = set()
+  # _tf_api_names attribute name
+  tensorflow_api_attr = tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names
+
+  def visit(unused_path, unused_parent, children):
+    """Visitor that collects rename strings to add to rename_line_set."""
+    for child in children:
+      _, attr = tf_decorator.unwrap(child[1])
+      if not hasattr(attr, '__dict__'):
+        continue
+      api_names = attr.__dict__.get(tensorflow_api_attr, [])
+      deprecated_api_names = attr.__dict__.get('_tf_deprecated_api_names', [])
+      canonical_name = tf_export.get_canonical_name(
+          api_names, deprecated_api_names)
+      for name in deprecated_api_names:
+        rename_line_set.add('    \'tf.%s\': \'tf.%s\'' % (name, canonical_name))
+
+  visitor = public_api.PublicAPIVisitor(visit)
+  visitor.do_not_descend_map['tf'].append('contrib')
+  traverse.traverse(tf, visitor)
+
+  renames_file_text = '%srenames = {\n%s\n}\n' % (
+      _FILE_HEADER, ',\n'.join(sorted(rename_line_set)))
+  file_io.write_string_to_file(output_file_path, renames_file_text)
+
+
+def main(unused_argv):
+  update_renames_v2(_OUTPUT_FILE_PATH)
+
+
+if __name__ == '__main__':
+  tf.app.run(main=main)
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index fd94d64268ae69a83f04124e33e5bde3c2f49b75..f7fe4119dabd5423a14d64176cb0f5debd830c8b 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -63,7 +63,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
     >>/etc/bazel.bazelrc
 # Install the most recent bazel release.
-ENV BAZEL_VERSION 0.14.1
+ENV BAZEL_VERSION 0.15.0
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 44120bf2747f91ff1191ac1ecebdd7f0e12cc8da..340f96df483411ab8f5714a76e00fd8b5f5c6435 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -40,10 +40,6 @@ RUN mkdir /usr/local/cuda-9.0/lib &&  \
     ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
     ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
 
-# TODO(tobyboyd): Remove after license is excluded from BUILD file.
-RUN gunzip /usr/share/doc/libnccl2/NCCL-SLA.txt.gz && \
-    cp /usr/share/doc/libnccl2/NCCL-SLA.txt /usr/local/cuda/
-
 RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
     python get-pip.py && \
     rm get-pip.py
@@ -83,7 +79,7 @@ RUN echo "startup --batch" >>/etc/bazel.bazelrc
 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \
     >>/etc/bazel.bazelrc
 # Install the most recent bazel release.
-ENV BAZEL_VERSION 0.14.1
+ENV BAZEL_VERSION 0.15.0
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7 b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
index 3bedc8cf3462aabf25f55706b3483907c5d5b467..30bc2d28069758f20e99d84b159b63a164aece1d 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu-cuda9-cudnn7
@@ -4,7 +4,7 @@ LABEL maintainer="Gunhan Gulsoy <gunan@google.com>"
 
 # It is possible to override these for releases.
 ARG TF_BRANCH=master
-ARG BAZEL_VERSION=0.5.4
+ARG BAZEL_VERSION=0.15.0
 ARG TF_AVAILABLE_CPUS=32
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
diff --git a/tensorflow/tools/docker/notebooks/1_hello_tensorflow.ipynb b/tensorflow/tools/docker/notebooks/1_hello_tensorflow.ipynb
index 0633b03259a06363d0d069eb479971f8b87f983e..8fa871ef7729a9194de282b84cdd9539c80f8555 100644
--- a/tensorflow/tools/docker/notebooks/1_hello_tensorflow.ipynb
+++ b/tensorflow/tools/docker/notebooks/1_hello_tensorflow.ipynb
@@ -665,7 +665,7 @@
       "source": [
         "## What's next?\n",
         "\n",
-        "This has been a gentle introduction to TensorFlow, focused on what TensorFlow is and the very basics of doing anything in TensorFlow. If you'd like more, the next tutorial in the series is Getting Started with TensorFlow, also available in the [notebooks directory](..)."
+        "This has been a gentle introduction to TensorFlow, focused on what TensorFlow is and the very basics of doing anything in TensorFlow. If you'd like more, the next tutorial in the series is Getting Started with TensorFlow, also available in the [notebooks directory](../notebooks)."
       ]
     }
   ],
diff --git a/tensorflow/tools/docs/generate.py b/tensorflow/tools/docs/generate.py
index fc93085e3e0316cf274f4d9b325d6af0ea3a2f83..f96887e4c70b0580fd8a799c8f1d602491a66ef2 100644
--- a/tensorflow/tools/docs/generate.py
+++ b/tensorflow/tools/docs/generate.py
@@ -31,6 +31,11 @@ if __name__ == '__main__':
   doc_generator = generate_lib.DocGenerator()
   doc_generator.add_output_dir_argument()
   doc_generator.add_src_dir_argument()
+  doc_generator.argument_parser.add_argument(
+      '--site_api_path',
+      type=str, default='api_docs/python',
+      help='The path from the site-root to api_docs'
+           'directory for this project')
 
   # This doc generator works on the TensorFlow codebase. Since this script lives
   # at tensorflow/tools/docs, and all code is defined somewhere inside
diff --git a/tensorflow/tools/docs/generate_lib.py b/tensorflow/tools/docs/generate_lib.py
index e7634cd5dcf19d5f21b0bd42b282dfe928659a52..4f70a6936490dab833dd32c30598f2e6f493feaa 100644
--- a/tensorflow/tools/docs/generate_lib.py
+++ b/tensorflow/tools/docs/generate_lib.py
@@ -55,7 +55,8 @@ def write_docs(output_dir,
                parser_config,
                yaml_toc,
                root_title='TensorFlow',
-               search_hints=True):
+               search_hints=True,
+               site_api_path=None):
   """Write previously extracted docs to disk.
 
   Write a docs page for each symbol included in the indices of parser_config to
@@ -73,6 +74,8 @@ def write_docs(output_dir,
     root_title: The title name for the root level index.md.
     search_hints: (bool) include meta-data search hints at the top of each
       output file.
+    site_api_path: Used to write the api-duplicates _redirects.yaml file. if
+      None (the default) the file is not generated.
 
   Raises:
     ValueError: if `output_dir` is not an absolute path
@@ -92,6 +95,9 @@ def write_docs(output_dir,
   #  - symbol name(string):pathname (string)
   symbol_to_file = {}
 
+  # Collect redirects for an api _redirects.yaml file.
+  redirects = ['redirects:\n']
+
   # Parse and write Markdown pages, resolving cross-links (@{symbol}).
   for full_name, py_object in six.iteritems(parser_config.index):
     parser_config.reference_resolver.current_doc_full_name = full_name
@@ -150,6 +156,25 @@ def write_docs(output_dir,
       raise OSError(
           'Cannot write documentation for %s to %s' % (full_name, directory))
 
+    if site_api_path:
+      duplicates = parser_config.duplicates.get(full_name, [])
+      if not duplicates:
+        continue
+
+      duplicates = [item for item in duplicates if item != full_name]
+      template = ('- from: /{}\n'
+                  '  to: /{}\n')
+      for dup in duplicates:
+        from_path = os.path.join(site_api_path, dup.replace('.', '/'))
+        to_path = os.path.join(site_api_path, full_name.replace('.', '/'))
+        redirects.append(
+            template.format(from_path, to_path))
+
+  if site_api_path:
+    api_redirects_path = os.path.join(output_dir, '_redirects.yaml')
+    with open(api_redirects_path, 'w') as redirect_file:
+      redirect_file.write(''.join(redirects))
+
   if yaml_toc:
     # Generate table of contents
 
@@ -608,7 +633,8 @@ class DocGenerator(object):
         parser_config,
         yaml_toc=self.yaml_toc,
         root_title=root_title,
-        search_hints=getattr(flags, 'search_hints', True))
+        search_hints=getattr(flags, 'search_hints', True),
+        site_api_path=getattr(flags, 'site_api_path', None))
 
     # Replace all the @{} references in files under `FLAGS.src_dir`
     replace_refs(flags.src_dir, flags.output_dir, reference_resolver, '*.md')
diff --git a/tensorflow/tools/docs/generate_lib_test.py b/tensorflow/tools/docs/generate_lib_test.py
index 7a6f9fd9f799db5a14015d77e5297955c76a51cd..de18b1325454ce4c1c02bb943f7443c3e1876d5f 100644
--- a/tensorflow/tools/docs/generate_lib_test.py
+++ b/tensorflow/tools/docs/generate_lib_test.py
@@ -107,7 +107,18 @@ class GenerateTest(googletest.TestCase):
 
     output_dir = googletest.GetTempDir()
 
-    generate_lib.write_docs(output_dir, parser_config, yaml_toc=True)
+    generate_lib.write_docs(output_dir, parser_config, yaml_toc=True,
+                            site_api_path='api_docs/python')
+
+    # Check redirects
+    redirects_file = os.path.join(output_dir, '_redirects.yaml')
+    self.assertTrue(os.path.exists(redirects_file))
+    with open(redirects_file) as f:
+      redirects = f.read()
+    self.assertEqual(redirects.split(), [
+        'redirects:', '-', 'from:', '/api_docs/python/tf/test_function', 'to:',
+        '/api_docs/python/tf/TestModule/test_function'
+    ])
 
     # Make sure that the right files are written to disk.
     self.assertTrue(os.path.exists(os.path.join(output_dir, 'index.md')))
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index f1d361e07d8f00aa37a4e063a7d17bf85de74fde..156636ab8215d9abdc9e0ed461df550f1c7ed09c 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -159,7 +159,7 @@ Status FuseScaleOffsetToConvWeights(const std::vector<float>& scale_values,
   NodeDef bias_add_node;
   bias_add_node.set_op("BiasAdd");
   bias_add_node.set_name(conv_output_name);
-  if (!conv_node.attr().count("data_format")) {
+  if (conv_node.attr().count("data_format") > 0) {
     CopyNodeAttr(conv_node, "data_format", "data_format", &bias_add_node);
   }
   CopyNodeAttr(conv_node, "T", "T", &bias_add_node);
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index e661fb1adc8e59934a5d3ff761d8f64ceaa71263..ab39ed8d696625bf82ee64cee643de26fe7e32a6 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -78,7 +78,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/labeled_tensor:labeled_tensor_pip",
     "//tensorflow/contrib/nn:nn_py",
     "//tensorflow/contrib/predictor:predictor_pip",
-    "//tensorflow/contrib/proto:proto_pip",
+    "//tensorflow/contrib/proto:proto",
     "//tensorflow/contrib/receptive_field:receptive_field_pip",
     "//tensorflow/contrib/rpc:rpc_pip",
     "//tensorflow/contrib/session_bundle:session_bundle_pip",
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 4101b34a11037be4c82791e786a79c6b7262114c..ca40f2eaa81128b5091899702f82f69aa7984a07 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -17,8 +17,12 @@
 
 set -e
 
+function is_absolute {
+  [[ "$1" = /* ]] || [[ "$1" =~ ^[a-zA-Z]:[/\\].* ]]
+}
+
 function real_path() {
-  [[ $1 = /* ]] && echo "$1" || echo "$PWD/${1#./}"
+  is_absolute "$1" && echo "$1" || echo "$PWD/${1#./}"
 }
 
 function cp_external() {
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index c630ca04b885d35da6550d4e5f3e6912b5fd7a00..1f4c3d47bfe532d12635df0566ed3e6cef5e6a33 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -45,7 +45,7 @@ DOCLINES = __doc__.split('\n')
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.9.0-rc0'
+_VERSION = '1.9.0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
@@ -55,7 +55,7 @@ REQUIRED_PACKAGES = [
     'six >= 1.10.0',
     'protobuf >= 3.6.0',
     'setuptools <= 39.1.0',
-    'tensorboard >= 1.8.0, < 1.9.0',
+    'tensorboard >= 1.10.0, < 1.11.0',
     'termcolor >= 1.1.0',
 ]
 
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 4b4f31813c36b1f41468f81644de65a986e23099..45b1abeb10de2e22b5c5760e9e248242ad02fbab 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -144,13 +144,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "ortools_archive",
       urls = [
-          "https://mirror.bazel.build/github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
-          # Please uncomment me, when the next upgrade happens. Then
-          # remove the whitelist entry in third_party/repo.bzl.
-          # "https://github.com/google/or-tools/archive/253f7955c6a1fd805408fba2e42ac6d45b312d15.tar.gz",
+          "https://mirror.bazel.build/github.com/google/or-tools/archive/v6.7.2.tar.gz",
+          "https://github.com/google/or-tools/archive/v6.7.2.tar.gz",
       ],
-      sha256 = "932075525642b04ac6f1b50589f1df5cd72ec2f448b721fd32234cf183f0e755",
-      strip_prefix = "or-tools-253f7955c6a1fd805408fba2e42ac6d45b312d15/src",
+      sha256 = "d025a95f78b5fc5eaa4da5f395f23d11c23cf7dbd5069f1f627f002de87b86b9",
+      strip_prefix = "or-tools-6.7.2/src",
       build_file = clean_dep("//third_party:ortools.BUILD"),
   )
 
@@ -487,11 +485,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/10c3b3d15ed6a788ac12221b784caf81fb8248b5.tar.gz",
-	  "https://github.com/llvm-mirror/llvm/archive/10c3b3d15ed6a788ac12221b784caf81fb8248b5.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/7b3bfc8151f3a6bcd9642c49c1f86f66cc43a428.tar.gz",
+	  "https://github.com/llvm-mirror/llvm/archive/7b3bfc8151f3a6bcd9642c49c1f86f66cc43a428.tar.gz",
       ],
-      sha256 = "a9feb6b47267c30fd7c19ebfdf4dbde6757054f716fa77c09bcb1106799c3253",
-      strip_prefix = "llvm-10c3b3d15ed6a788ac12221b784caf81fb8248b5",
+      sha256 = "c6cbb21acd46e3e00faa8c379595ecffb99ef77622da17f29371db2bfad1d3d3",
+      strip_prefix = "llvm-7b3bfc8151f3a6bcd9642c49c1f86f66cc43a428",
       build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"),
   )
 
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/Core b/third_party/eigen3/unsupported/Eigen/CXX11/Core
deleted file mode 100644
index 1b3690716c03ca635755d920cd3be598cb920c6a..0000000000000000000000000000000000000000
--- a/third_party/eigen3/unsupported/Eigen/CXX11/Core
+++ /dev/null
@@ -1,46 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_CORE_MODULE
-#define EIGEN_CXX11_CORE_MODULE
-
-#include <Eigen/Core>
-
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
-
-/** \defgroup CXX11_Core_Module C++11 Core Module
-  *
-  * This module provides common core features for all modules that
-  * explicitly depend on C++11. Currently, this is only the Tensor
-  * module. Note that at this stage, you should not need to include
-  * this module directly.
-  *
-  * It also provides a limited fallback for compilers that don't support
-  * CXX11 yet, such as nvcc.
-  *
-  * \code
-  * #include <Eigen/CXX11/Core>
-  * \endcode
-  */
-
-// Only a subset of cxx11 is allowed at Google, so we default to emulate the
-// cxx11 functionality that we need.
-#include "src/Core/util/FixedSizeVector.h"
-#if 1
-#include <vector>
-#include "src/Core/util/EmulateCXX11Meta.h"
-#else
-#include "src/Core/util/CXX11Workarounds.h"
-#include "src/Core/util/CXX11Meta.h"
-#endif
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
-
-#endif // EIGEN_CXX11_CORE_MODULE
-
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks b/third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks
deleted file mode 100644
index 7741b68d8a73dfc738f73e4630b5e2020de50756..0000000000000000000000000000000000000000
--- a/third_party/eigen3/unsupported/Eigen/CXX11/NeuralNetworks
+++ /dev/null
@@ -1,35 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_NEURAL_NETWORKS_MODULE
-#define EIGEN_CXX11_NEURAL_NETWORKS_MODULE
-
-#include "unsupported/Eigen/CXX11/Tensor"
-
-/** \defgroup CXX11_NeuralNetworks_Module Neural Networks Module
-  *
-  * This module provides an efficient implementation of the common primitives
-  * used by neural networks.
-  * The primitives are  built on top of the tensor library.
-  *
-  * \code
-  * #include <Eigen/CXX11/NeuralNetworks>
-  * \endcode
-  */
-
-#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h"
-#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h"
-#include "unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h"
-#include "unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h"
-#include "unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h"
-#include "unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h"
-#include "unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h"
-#include "unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h"
-
-#endif  // EIGEN_CXX11_NEURAL_NETWORKS_MODULE
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
index 6b625abc3e569ffcd50aa978b3f715024d36cb0b..5ab36649187a41507f1201804090a801d7f639f9 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
@@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_CXX11_FIXED_POINT_TYPES_H
-#define EIGEN_CXX11_FIXED_POINT_TYPES_H
+#ifndef CXX11_SRC_FIXEDPOINT_FIXEDPOINTTYPES_H_
+#define CXX11_SRC_FIXEDPOINT_FIXEDPOINTTYPES_H_
 
 #include <cmath>
 #include <iostream>
@@ -339,4 +339,4 @@ EIGEN_STRONG_INLINE std::ostream& operator<<(std::ostream& os, QInt32 a) {
 
 }  // namespace Eigen
 
-#endif  // EIGEN_CXX11_FIXED_POINT_TYPES_H
+#endif  // CXX11_SRC_FIXEDPOINT_FIXEDPOINTTYPES_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
index 4d0dca07df05f6a98a13763c53977445a2ffd0ca..e6f4080ae127a93fc7830a8dcded1b74f581188f 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
@@ -7,9 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H
-#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H
-
+#ifndef CXX11_SRC_FIXEDPOINT_MATMATPRODUCT_H_
+#define CXX11_SRC_FIXEDPOINT_MATMATPRODUCT_H_
 
 namespace Eigen {
 namespace internal {
@@ -24,6 +23,14 @@ template<> struct scalar_product_traits<QInt8, QInt8>
   typedef QInt32 ReturnType;
 };
 
+// Accumulate the product of 2 QInt16 inputs on 32 bits to prevent
+// overflows
+template <>
+struct scalar_product_traits<QInt16, QInt16> {
+  enum { Defined = 1 };
+  typedef QInt32 ReturnType;
+};
+
 // Accumulate the product of QInt8 inputs with QUint8 inputs on 32 bits
 // to prevent overflows
 template<> struct scalar_product_traits<QInt8, QUInt8>
@@ -247,9 +254,76 @@ void gebp_kernel<QUInt8, QInt8, Index, DataMapper, mr, nr, ConjugateLhs, Conjuga
 }
 #endif
 
-}  // namespace internal
-}  // namespace Eigen
+#ifndef EIGEN_USE_OPTIMIZED_INT16_INT16_MAT_MAT_PRODUCT
+
+template <bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<QInt16, QInt16, _ConjLhs, _ConjRhs> {
+ public:
+  typedef QInt16 LhsScalar;
+  typedef QInt16 RhsScalar;
+  typedef QInt32 ResScalar;
+
+  enum {
+    // register block size along the M and N directions
+    // One for the current implementation
+    nr = 1,
+    mr = 1,
+    // Progress made at each iteration of the product loop
+    // also 1 for the current implementation
+    LhsProgress = 1,
+    RhsProgress = 1
+  };
+};
+
+// The signed 16bit Mat-Mat product itself.
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<QInt16, QInt16, Index, DataMapper, mr, nr, ConjugateLhs,
+                   ConjugateRhs> {
+  EIGEN_DONT_INLINE
+  void operator()(const DataMapper& res, const QInt16* blockA,
+                  const QInt16* blockB, Index rows, Index depth, Index cols,
+                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
+};
+
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE void gebp_kernel<QInt16, QInt16, Index, DataMapper, mr, nr,
+                                   ConjugateLhs, ConjugateRhs>::
+operator()(const DataMapper& res, const QInt16* blockA, const QInt16* blockB,
+           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
+           Index strideB, Index offsetA, Index offsetB) {
+  EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
 
+  eigen_assert(alpha.value == 1);
+  eigen_assert(strideA == -1);
+  eigen_assert(strideB == -1);
+  eigen_assert(offsetA == 0);
+  eigen_assert(offsetB == 0);
+
+  eigen_assert(rows > 0);
+  eigen_assert(cols > 0);
+  eigen_assert(depth > 0);
+  eigen_assert(blockA);
+  eigen_assert(blockB);
+
+  for (Index j = 0; j < cols; ++j) {
+    Index startB = j * depth;
 
+    for (Index i = 0; i < rows; ++i) {
+      Index startA = i * depth;
+
+      for (Index k = 0; k < depth; ++k) {
+        res(i, j) += blockA[startA + k] * blockB[startB + k];
+      }
+    }
+  }
+}
+#endif
+
+}  // namespace internal
+}  // namespace Eigen
 
-#endif  // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_H
+#endif  // CXX11_SRC_FIXEDPOINT_MATMATPRODUCT_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
index 6b4b0edcfb619de4b4118797ae9592ff6f3c2dbf..66532fb60028789df7495bc54c833622187e79bf 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
@@ -3,17 +3,493 @@
 //
 // Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
 // Copyright (C) 2015 Matthew Sarett <msarett@google.com>
+// Copyright (C) 2016 Nishant Patil <nishantpatil@google.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H
-#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H
+#ifndef CXX11_SRC_FIXEDPOINT_MATMATPRODUCTAVX2_H_
+#define CXX11_SRC_FIXEDPOINT_MATMATPRODUCTAVX2_H_
 
 namespace Eigen {
 namespace internal {
 
+// AVX2 optimized implementation of Mat-Mat product.
+// LHS is encoded using signed 16-bit integers.
+// RHS is encoded using signed 16-bit integers.
+#ifdef EIGEN_USE_OPTIMIZED_INT16_INT16_MAT_MAT_PRODUCT
+
+// Define quantized traits
+template <bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<QInt16, QInt16, _ConjLhs, _ConjRhs> {
+ public:
+  typedef QInt16 LhsScalar;
+  typedef QInt16 RhsScalar;
+  typedef QInt32 ResScalar;
+
+  enum {
+    // Define register blocking scheme.
+    nr = 16,
+    mr = 16,
+    kr = 4,
+    // Ignore progress tracking per loop iteration.
+    LhsProgress = -1,
+    RhsProgress = -1
+  };
+};
+
+// Specialized blocking for quantized implementations.
+// Used by TensorContractionThreadPool, inputs must have dimensions that are
+// multiples of 32.
+template <typename Index, int ShardingType>
+class TensorContractionBlocking<QInt16, QInt16, Index, ShardingType> {
+ public:
+  TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1)
+      : kc_(((k + 15) / 16) * 16),
+        mc_(((m + 15) / 16) * 16),
+        nc_(((n + 15) / 16) * 16) {
+    eigen_assert(mc_ % 16 == 0);
+    eigen_assert(kc_ % 16 == 0);
+    if (!k || !m || !n) {
+      return;
+    }
+
+    if (ShardingType == ShardByCol) {
+      eigen_assert(nc_ % 16 == 0);
+      nc_ = (((nc_ / num_threads) + 15) / 16) * 16;
+    } else {
+      eigen_assert(nc_ % 16 == 0);
+      mc_ = (((mc_ / num_threads) + 15) / 16) * 16;
+    }
+  }
+
+  EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
+  EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
+  EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
+
+ private:
+  Index kc_;
+  Index mc_;
+  Index nc_;
+};
+
+// Specialized blocking for quantized implementations.
+// Used by TensorContraction and GeneralMatrixMatrix, inputs are padded to
+// multiples of 32.
+template <int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
+class gemm_blocking_space<ColMajor, QInt16, QInt16, MaxRows, MaxCols, MaxDepth,
+                          KcFactor, false>
+    : public level3_blocking<QInt16, QInt16> {
+  DenseIndex m_sizeA;
+  DenseIndex m_sizeB;
+
+ public:
+  gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth,
+                      DenseIndex /*num_threads*/, bool /*l3_blocking*/) {
+    this->m_mc = ((rows + 15) / 16) * 16;
+    this->m_nc = ((cols + 15) / 16) * 16;
+    this->m_kc = ((depth + 15) / 16) * 16;
+    m_sizeA = this->m_mc * this->m_kc;
+    m_sizeB = this->m_kc * this->m_nc;
+  }
+  void allocateA() {
+    if (this->m_blockA == 0) this->m_blockA = aligned_new<QInt16>(m_sizeA);
+  }
+  void allocateB() {
+    if (this->m_blockB == 0) this->m_blockB = aligned_new<QInt16>(m_sizeB);
+  }
+  void allocateAll() {
+    allocateA();
+    allocateB();
+  }
+  ~gemm_blocking_space() {
+    aligned_delete(this->m_blockA, m_sizeA);
+    aligned_delete(this->m_blockB, m_sizeB);
+  }
+};
+
+// Below are the fully optimized versions that are correct only for sizes that
+// are multiple of 16.  It is about a 10% performance benefit to keep these
+// implementations separate.
+
+// Arrange a block of the left input matrix in contiguous memory.
+//
+// Given column major input (A0 beside A1 in memory):
+// A0 B0 C0 D0 E0 F0 G0 H0 ...
+// A1 B1 C1 D1 E1 F1 G1 H1 ...
+// A2 B2 C2 D2 E2 F2 G2 H2 ...
+// A3 B3 C3 D3 E3 F3 G3 H3 ...
+// A4 B4 C4 D4 E4 F4 G4 H4 ...
+// A5 B5 C5 D5 E5 F5 G5 H5 ...
+// A6 B6 C6 D6 E6 F6 G6 H6 ...
+// A7 B7 C7 D7 E7 F7 G7 H7 ...
+// A8 ...
+// ...
+//
+// Packing with m = 8 yields row major output (A0 beside B0 in memory):
+// A0 B0
+// A1 B1
+// A2 B2
+// A3 B3
+// A4 B4
+// A5 B5
+// A6 B6
+// A7 B7
+// ...
+//
+// The purpose is to collect m rows of size k.  Two elements of the same
+// row are arranged contiguously because madd performs an adjacent addition
+// in the kernel.
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2,
+          bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<QInt16, Index, DataMapper, Pack1, Pack2, ColMajor,
+                     Conjugate, PanelMode> {
+  EIGEN_DONT_INLINE void operator()(QInt16* blockA, const DataMapper& lhs,
+                                    Index depth, Index rows, Index stride = 0,
+                                    Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2,
+          bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<QInt16, Index, DataMapper, Pack1, Pack2,
+                                     ColMajor, Conjugate, PanelMode>::
+operator()(QInt16* blockA, const DataMapper& lhs, Index depth, Index rows,
+           Index stride, Index offset) {
+  eigen_assert(stride == 0);
+  eigen_assert(offset == 0);
+
+  // Use alternate function for weird sizes
+  if (rows % 16 != 0 || depth % 16 != 0) {
+    assert(false &&
+           "only depths and rows that are a multiple of 16 are currently "
+           "supported");
+    // gemm_pack_lhs_any<QInt16, Index, DataMapper, Pack1, Pack2, ColMajor,
+    // Conjugate, PanelMode> lhs_pack;
+    // return lhs_pack(blockA, lhs, depth, rows, stride, offset);
+  }
+
+  // Get vector pointer
+  __m256i* blockA_256 = reinterpret_cast<__m256i*>(blockA);
+
+  // Pack rows in sets of 16
+  for (Index m = 0; m < rows; m += 16) {
+    // Pack depth in sets of 4
+    for (Index k = 0; k < depth; k += 4) {
+      // Load vectors
+      __m256i L_A = lhs.loadPacket(m, k);
+      __m256i L_B = lhs.loadPacket(m, k + 1);
+      __m256i L_C = lhs.loadPacket(m, k + 2);
+      __m256i L_D = lhs.loadPacket(m, k + 3);
+
+      // Rearrange the inputs as required by the kernel
+      __m256i L_AB0_AB7 = _mm256_unpacklo_epi16(L_A, L_B);
+      __m256i L_AB8_AB15 = _mm256_unpackhi_epi16(L_A, L_B);
+      __m256i L_CD0_CD7 = _mm256_unpacklo_epi16(L_C, L_D);
+      __m256i L_CD8_CD15 = _mm256_unpackhi_epi16(L_C, L_D);
+
+      __m256i L_AD0 = _mm256_permute2x128_si256(L_AB0_AB7, L_AB8_AB15, 0x20);
+      _mm256_store_si256(blockA_256++, L_AD0);
+      __m256i L_AD8 = _mm256_permute2x128_si256(L_CD0_CD7, L_CD8_CD15, 0x20);
+      _mm256_store_si256(blockA_256++, L_AD8);
+      __m256i L_AD16 = _mm256_permute2x128_si256(L_AB0_AB7, L_AB8_AB15, 0x31);
+      _mm256_store_si256(blockA_256++, L_AD16);
+      __m256i L_AD24 = _mm256_permute2x128_si256(L_CD0_CD7, L_CD8_CD15, 0x31);
+      _mm256_store_si256(blockA_256++, L_AD24);
+    }
+  }
+}
+
+// Arrange a block of the right input matrix in contiguous memory.
+//
+// Given column major input (A0 beside A1 in memory):
+// A0 B0 C0 D0 E0 F0 G0 H0 ...
+// A1 B1 C1 D1 E1 F1 G1 H1 ...
+// A2 B2 C2 D2 E2 F2 G2 H2 ...
+// A3 B3 C3 D3 E3 F3 G3 H3 ...
+// A4 B4 C4 D4 E4 F4 G4 H4 ...
+// A5 B5 C5 D5 E5 F5 G5 H5 ...
+// A6 B6 C6 D6 E6 F6 G6 H6 ...
+// A7 B7 C7 D7 E7 F7 G7 H7 ...
+// A8 ...
+// ...
+// Packing yields row major output (A0 beside A1 in memory):
+// A0 A1 A2 A3 A4 A5 A6 A7
+// B0 B1 B2 B3 B4 B5 B6 B7
+// ...
+//
+// At least two elements of the same col are arranged contiguously because
+// maddubs and madd both perform an adjacent addition in the kernel.  We can
+// save work by leaving 4 adjacent elements because kr = 4.
+// The purpose is to collect n cols of size k.  Two elements of the same
+// col are arranged contiguously because madd performs an adjacent addition
+// in the kernel.
+template <typename Index, typename DataMapper, int nr, bool Conjugate,
+          bool PanelMode>
+struct gemm_pack_rhs<QInt16, Index, DataMapper, nr, ColMajor, Conjugate,
+                     PanelMode> {
+  EIGEN_DONT_INLINE void operator()(QInt16* blockB, const DataMapper& rhs,
+                                    Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate,
+          bool PanelMode>
+EIGEN_DONT_INLINE void
+gemm_pack_rhs<QInt16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::
+operator()(QInt16* blockB, const DataMapper& rhs, Index depth, Index cols,
+           Index stride, Index offset) {
+  eigen_assert(stride == 0);
+  eigen_assert(offset == 0);
+
+  // Use alternate function for weird sizes
+  if (cols % 16 != 0 || depth % 16 != 0) {
+    assert(false &&
+           "only depths and cols that are a multiple of 16 are currently "
+           "supported");
+    // gemm_pack_rhs_any<QInt16, Index, DataMapper, nr, ColMajor, Conjugate,
+    // PanelMode> rhs_pack;
+    // return rhs_pack(blockB, rhs, depth, cols, stride, offset);
+  }
+
+  // Get vector pointer
+  __m256i* blockB_256 = reinterpret_cast<__m256i*>(blockB);
+
+  // Perform a step of the packing for 4 columns
+  __m256i R_AB_L, R_AB_H, R_CD_L, R_CD_H, R_AD_0, R_AD_4, R_AD_8, R_AD_12;
+#define PACK_STEP                                            \
+  R_AB_L = _mm256_unpacklo_epi64(R_A, R_B);                  \
+  R_CD_L = _mm256_unpacklo_epi64(R_C, R_D);                  \
+  R_AB_H = _mm256_unpackhi_epi64(R_A, R_B);                  \
+  R_CD_H = _mm256_unpackhi_epi64(R_C, R_D);                  \
+  R_AD_0 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x20);  \
+  R_AD_8 = _mm256_permute2x128_si256(R_AB_L, R_CD_L, 0x31);  \
+  R_AD_4 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x20);  \
+  R_AD_12 = _mm256_permute2x128_si256(R_AB_H, R_CD_H, 0x31); \
+  _mm256_store_si256(blockB_256, R_AD_0);                    \
+  _mm256_store_si256(blockB_256 + 4, R_AD_4);                \
+  _mm256_store_si256(blockB_256 + 8, R_AD_8);                \
+  _mm256_store_si256(blockB_256 + 12, R_AD_12);              \
+  blockB_256++;
+
+  // Pack cols in sets of 16
+  for (Index n = 0; n < cols; n += 16) {
+    // Pack depth in sets of 16
+    for (Index k = 0; k < depth; k += 16) {
+      __m256i R_A = rhs.loadPacket(k, n);
+      __m256i R_B = rhs.loadPacket(k, n + 1);
+      __m256i R_C = rhs.loadPacket(k, n + 2);
+      __m256i R_D = rhs.loadPacket(k, n + 3);
+      PACK_STEP;
+
+      R_A = rhs.loadPacket(k, n + 4);
+      R_B = rhs.loadPacket(k, n + 5);
+      R_C = rhs.loadPacket(k, n + 6);
+      R_D = rhs.loadPacket(k, n + 7);
+      PACK_STEP;
+
+      R_A = rhs.loadPacket(k, n + 8);
+      R_B = rhs.loadPacket(k, n + 9);
+      R_C = rhs.loadPacket(k, n + 10);
+      R_D = rhs.loadPacket(k, n + 11);
+      PACK_STEP;
+
+      R_A = rhs.loadPacket(k, n + 12);
+      R_B = rhs.loadPacket(k, n + 13);
+      R_C = rhs.loadPacket(k, n + 14);
+      R_D = rhs.loadPacket(k, n + 15);
+      PACK_STEP;
+
+      blockB_256 += 12;
+    }
+  }
+#undef PACK_STEP
+}
+
+// Perform the actual multiplication on packed inputs
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<QInt16, QInt16, Index, DataMapper, mr, nr, ConjugateLhs,
+                   ConjugateRhs> {
+  typedef typename DataMapper::LinearMapper LinearMapper;
+
+  EIGEN_DONT_INLINE
+  void operator()(const DataMapper& res, const QInt16* blockA,
+                  const QInt16* blockB, Index rows, Index depth, Index cols,
+                  QInt32 alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
+};
+
+template <typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE void gebp_kernel<QInt16, QInt16, Index, DataMapper, mr, nr,
+                                   ConjugateLhs, ConjugateRhs>::
+operator()(const DataMapper& res, const QInt16* blockA, const QInt16* blockB,
+           Index rows, Index depth, Index cols, QInt32 alpha, Index strideA,
+           Index strideB, Index offsetA, Index offsetB) {
+  EIGEN_STATIC_ASSERT(!ConjugateLhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT(!ConjugateRhs, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  eigen_assert(alpha.value == 1);
+  eigen_assert(strideA == -1);
+  eigen_assert(strideB == -1);
+  eigen_assert(offsetA == 0);
+  eigen_assert(offsetB == 0);
+  eigen_assert(rows > 0);
+  eigen_assert(cols > 0);
+  eigen_assert(depth > 0);
+  eigen_assert(blockA);
+  eigen_assert(blockB);
+
+  // Use alternate function for weird sizes
+  if (rows % 16 != 0 || cols % 16 != 0 || depth % 16 != 0) {
+    assert(false &&
+           "only depths, cols and rows that are a multiple of 16 are currently "
+           "supported");
+    // gebp_kernel_any<QInt16, QInt16, Index, DataMapper, mr, nr, ConjugateLhs,
+    // ConjugateRhs> gebp;
+    // return gebp(res, blockA, blockB, rows, depth, cols, alpha, strideA,
+    // strideB, offsetA, offsetB);
+  }
+
+  // Create result block
+  QInt32* blockO = aligned_new<QInt32>(16 * 16);
+  memset(blockO, 0, 16 * 16 * sizeof(QInt32));
+
+  // Get vectorized pointers
+  __m256i* blockO_256 = reinterpret_cast<__m256i*>(blockO);
+  const __m256i* blockA_256 = reinterpret_cast<const __m256i*>(blockA);
+  const __m256i* blockB_256 = reinterpret_cast<const __m256i*>(blockB);
+
+  // Loop over blocks of 16 columns
+  for (Index n = 0; n < cols; n += 16) {
+    // Reset index into blockA
+    Index indexL = 0;
+    // Loop over blocks of 16 rows
+    for (Index m = 0; m < rows; m += 16) {
+      // Reset index into blockB
+      Index indexR = n / 16 * depth;
+      // Loop over blocks of 4 on depth
+      for (Index k = 0; k < depth; k += 4) {
+        // Load inputs
+        __m256i L_AD0 = blockA_256[indexL++];
+        __m256i L_AD8 = blockA_256[indexL++];
+        __m256i L_EH0 = blockA_256[indexL++];
+        __m256i L_EH8 = blockA_256[indexL++];
+
+        __m256i R_AH0 = blockB_256[indexR++];
+        __m256i R_AH4 = blockB_256[indexR++];
+        __m256i R_AH8 = blockB_256[indexR++];
+        __m256i R_AH12 = blockB_256[indexR++];
+
+        // Declare variables used in COMPUTE_STEP
+        __m256i P_32_A, P_32_B, P_32;
+
+#define COMPUTE_STEP(R_INPUT_A, R_INPUT_B, OFFSET)                         \
+  P_32_A = _mm256_madd_epi16(R_INPUT_A, L_AD0);                            \
+  P_32_B = _mm256_madd_epi16(R_INPUT_B, L_AD8);                            \
+  P_32 = _mm256_add_epi32(P_32_A, P_32_B);                                 \
+  _mm256_store_si256(                                                      \
+      blockO_256 + 2 * OFFSET,                                             \
+      _mm256_add_epi32(_mm256_load_si256(blockO_256 + 2 * OFFSET), P_32)); \
+                                                                           \
+  P_32_A = _mm256_madd_epi16(R_INPUT_A, L_EH0);                            \
+  P_32_B = _mm256_madd_epi16(R_INPUT_B, L_EH8);                            \
+  P_32 = _mm256_add_epi32(P_32_A, P_32_B);                                 \
+  _mm256_store_si256(                                                      \
+      blockO_256 + 2 * OFFSET + 1,                                         \
+      _mm256_add_epi32(_mm256_load_si256(blockO_256 + 2 * OFFSET + 1), P_32));
+
+        // Permute and shuffle to copy a single value across the entire vector
+        // Then compute the multiplication
+        // Replicate lower 128-bits of R_AH0 across both lanes
+        __m256i R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x00);
+        // Copy first two elements of R_AH0 across entire vector
+        __m256i R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
+        // Copy second two elements of R_AH0 across entire vector
+        __m256i R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
+
+        COMPUTE_STEP(R_AD0, R_EH0, 0);
+        __m256i R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
+        __m256i R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
+        COMPUTE_STEP(R_AD1, R_EH1, 1);
+
+        // Replicate upper 128-bits of R_AH0 across both lanes
+        R_AH0_ = _mm256_permute2x128_si256(R_AH0, R_AH0, 0x11);
+        __m256i R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
+        __m256i R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
+        COMPUTE_STEP(R_AD2, R_EH2, 2);
+        __m256i R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
+        __m256i R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
+        COMPUTE_STEP(R_AD3, R_EH3, 3);
+
+        R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x00);
+        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
+        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
+        COMPUTE_STEP(R_AD0, R_EH0, 4);
+        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
+        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
+        COMPUTE_STEP(R_AD1, R_EH1, 5);
+        R_AH0_ = _mm256_permute2x128_si256(R_AH4, R_AH4, 0x11);
+        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
+        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
+        COMPUTE_STEP(R_AD2, R_EH2, 6);
+        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
+        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
+        COMPUTE_STEP(R_AD3, R_EH3, 7);
+
+        R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x00);
+        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
+        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
+        COMPUTE_STEP(R_AD0, R_EH0, 8);
+        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
+        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
+        COMPUTE_STEP(R_AD1, R_EH1, 9);
+        R_AH0_ = _mm256_permute2x128_si256(R_AH8, R_AH8, 0x11);
+        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
+        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
+        COMPUTE_STEP(R_AD2, R_EH2, 10);
+        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
+        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
+        COMPUTE_STEP(R_AD3, R_EH3, 11);
+
+        R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x00);
+        R_AD0 = _mm256_shuffle_epi32(R_AH0_, 0x00);
+        R_EH0 = _mm256_shuffle_epi32(R_AH0_, 0x55);
+        COMPUTE_STEP(R_AD0, R_EH0, 12);
+        R_AD1 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
+        R_EH1 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
+        COMPUTE_STEP(R_AD1, R_EH1, 13);
+        R_AH0_ = _mm256_permute2x128_si256(R_AH12, R_AH12, 0x11);
+        R_AD2 = _mm256_shuffle_epi32(R_AH0_, 0x00);
+        R_EH2 = _mm256_shuffle_epi32(R_AH0_, 0x55);
+        COMPUTE_STEP(R_AD2, R_EH2, 14);
+        R_AD3 = _mm256_shuffle_epi32(R_AH0_, 0xAA);
+        R_EH3 = _mm256_shuffle_epi32(R_AH0_, 0xFF);
+        COMPUTE_STEP(R_AD3, R_EH3, 15);
+
+#undef COMPUTE_STEP
+      }
+
+      // Transfer the results to the result matrix
+      Index i = 0;
+      for (Index j = n; j < n + 16; j++) {
+        LinearMapper r0 = res.getLinearMapper(m, j);
+        LinearMapper r1 = res.getLinearMapper(m + 8, j);
+
+        r0.storePacket(0, _mm256_add_epi32(blockO_256[i++], r0.loadPacket(0)));
+        r1.storePacket(0, _mm256_add_epi32(blockO_256[i++], r1.loadPacket(0)));
+      }
+
+      // Zero the result block so it can be reused
+      memset(blockO, 0, 16 * 16 * sizeof(QInt32));
+    }
+  }
+  aligned_delete(blockO, 16 * 16);
+}
+
+#endif
+
 // AVX2 optimized implementation of Mat-Mat product.
 // LHS is encoded using signed 8-bit integers.
 // RHS is encoded using unsigned 8-bit integers.
@@ -1751,4 +2227,4 @@ void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, Conjuga
 }  // namespace internal
 }  // namespace Eigen
 
-#endif  // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_AVX2_H
+#endif  // CXX11_SRC_FIXEDPOINT_MATMATPRODUCTAVX2_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
index 99894cafb54255e4a47e1b44a9b7abd962b83188..9cd31570231173337ef0a7049171055bca897be4 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
@@ -8,9 +8,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H
-#define EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H
-
+#ifndef CXX11_SRC_FIXEDPOINT_MATMATPRODUCTNEON_H_
+#define CXX11_SRC_FIXEDPOINT_MATMATPRODUCTNEON_H_
 
 namespace Eigen {
 namespace internal {
@@ -90,6 +89,4 @@ void gebp_kernel<QInt8, QUInt8, Index, DataMapper, mr, nr, ConjugateLhs, Conjuga
 }  // namespace internal
 }  // namespace Eigen
 
-
-
-#endif  // EIGEN_CXX11_FIXED_POINT_MAT_MAT_PRODUCT_NEON_H
+#endif  // CXX11_SRC_FIXEDPOINT_MATMATPRODUCTNEON_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
index 18b5085b896ee5e14e22b771eb3f343019a01c40..ad11d3d44b813830c87f2634a9234adfeac80329 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
@@ -7,9 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H
-#define EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H
-
+#ifndef CXX11_SRC_FIXEDPOINT_MATVECPRODUCT_H_
+#define CXX11_SRC_FIXEDPOINT_MATVECPRODUCT_H_
 
 namespace Eigen {
 namespace internal {
@@ -47,6 +46,36 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QInt8,LhsMapper,ColMa
   }
 }
 
+// Mat-Vec product
+// Both lhs and rhs are encoded as 16bit signed integers
+template <typename Index, typename LhsMapper, bool ConjugateLhs,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index, QInt16, LhsMapper, ColMajor,
+                                     ConjugateLhs, QInt16, RhsMapper,
+                                     ConjugateRhs, Version> {
+  EIGEN_DONT_INLINE static void run(Index rows, Index cols,
+                                    const LhsMapper& lhs, const RhsMapper& rhs,
+                                    QInt32* res, Index resIncr, QInt16 alpha);
+};
+
+template <typename Index, typename LhsMapper, bool ConjugateLhs,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DONT_INLINE void general_matrix_vector_product<
+    Index, QInt16, LhsMapper, ColMajor, ConjugateLhs, QInt16, RhsMapper,
+    ConjugateRhs, Version>::run(Index rows, Index cols, const LhsMapper& lhs,
+                                const RhsMapper& rhs, QInt32* res,
+                                Index resIncr, QInt16 alpha) {
+  eigen_assert(alpha.value == 1);
+  eigen_assert(resIncr == 1);
+  eigen_assert(rows > 0);
+  eigen_assert(cols > 0);
+
+  for (Index i = 0; i < rows; ++i) {
+    for (Index j = 0; j < cols; ++j) {
+      res[i] += lhs(i, j) * rhs(j, 0);
+    }
+  }
+}
 
 // Mat-Vec product
 // The lhs is encoded using 8bit signed integers, the rhs using 8bit unsigned integers
@@ -118,6 +147,4 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,QUInt8,LhsMapper,ColM
 }  // namespace internal
 }  // namespace Eigen
 
-
-
-#endif  // EIGEN_CXX11_FIXED_POINT_MAT_VEC_PRODUCT_H
+#endif  // CXX11_SRC_FIXEDPOINT_MATVECPRODUCT_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
index cb1636256d7d5e0a9a11824a6c25b18fe79f4f56..3abd4ee49c2a6596ff9545faddedf926b4da857f 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
@@ -1,6 +1,5 @@
-#ifndef EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
-#define EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
-
+#ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
+#define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
 #ifdef _MSC_VER
 
 #include <immintrin.h>
@@ -29,7 +28,6 @@ inline int _mm256_extract_epi8_N1(const __m256i X)
 	return _mm_extract_epi8(_mm256_extractf128_si256((X), 1 >> 4), 1 % 16);
 }
 
-
 namespace Eigen {
 namespace internal {
 
@@ -502,4 +500,4 @@ struct functor_traits<scalar_product_op<QInt32, double>> {
 }  // end namespace internal
 }  // end namespace Eigen
 
-#endif  // EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
+#endif  // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX2_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
index 8f9906dbf9c0c9dd8e61964c65b36e8549a3241a..2092ce1d4c92754ce52b78f6a6e5fe814d4b7aaa 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
@@ -1,5 +1,5 @@
-#ifndef EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
-#define EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
+#ifndef CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
+#define CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
 
 #include "PacketMathAVX2.h"
 
@@ -542,4 +542,4 @@ EIGEN_STRONG_INLINE QInt8 predux_max<Packet64q8i>(const Packet64q8i& a) {
 }  // end namespace internal
 }  // end namespace Eigen
 
-#endif  // EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
+#endif  // CXX11_SRC_FIXEDPOINT_PACKETMATHAVX512_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h
index 7b4ecc752ff2e6b4544a0071fc0a971c6e9879a4..9561d6a3388d69f598a61220b1dfc29d068b8eeb 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h
@@ -1,5 +1,5 @@
-#ifndef EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
-#define EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
+#ifndef CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
+#define CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
 
 namespace Eigen {
 namespace internal {
@@ -52,8 +52,16 @@ template <>
 EIGEN_STRONG_INLINE Packet32q8u
 pcast<Packet8q32i, Packet32q8u>(const Packet8q32i& a, const Packet8q32i& b,
                                 const Packet8q32i& c, const Packet8q32i& d) {
+  // _mm256_packus_epi32 trims negative numbers to 0 but we can't allow numbers
+  // that are too large because _mm256_packus_epi16 expects signed input
+  // (example of problem input: 0x11111111, which saturates to 0xffff = -1,
+  // which saturates to 0).
+  const __m256i a_clip = _mm256_min_epi32(a, _mm256_set1_epi32(255));
+  const __m256i b_clip = _mm256_min_epi32(b, _mm256_set1_epi32(255));
+  const __m256i c_clip = _mm256_min_epi32(c, _mm256_set1_epi32(255));
+  const __m256i d_clip = _mm256_min_epi32(d, _mm256_set1_epi32(255));
   const __m256i converted = _mm256_packus_epi16(
-      _mm256_packs_epi32(a.val, b.val), _mm256_packs_epi32(c.val, d.val));
+      _mm256_packus_epi32(a_clip, b_clip), _mm256_packus_epi32(c_clip, d_clip));
   // Since packus does not cross 128 bit lane boundaries,
   // we have to permute to properly order the final result.
   const __m256i permute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
@@ -63,4 +71,4 @@ pcast<Packet8q32i, Packet32q8u>(const Packet8q32i& a, const Packet8q32i& b,
 }  // end namespace internal
 }  // end namespace Eigen
 
-#endif  // EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
+#endif  // CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX2_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
index 26735743d487cbc4b50a744ede463f4eac6070a8..a09eac67070477ad4b7ad7fd041800d1d815cac3 100644
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
+++ b/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
@@ -1,5 +1,5 @@
-#ifndef EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
-#define EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
+#ifndef CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
+#define CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
 
 namespace Eigen {
 namespace internal {
@@ -132,8 +132,15 @@ pcast<Packet16q32i, Packet64q8i>(const Packet16q32i& a,
                                  const Packet16q32i& b,
                                  const Packet16q32i& c,
                                  const Packet16q32i& d) {
-  __m512i converted = _mm512_packs_epi16(_mm512_packs_epi32(a.val, b.val),
-                                         _mm512_packs_epi32(c.val, d.val));
+  __m128i a_part = _mm512_cvtsepi32_epi8(a);
+  __m128i b_part = _mm512_cvtsepi32_epi8(b);
+  __m128i c_part = _mm512_cvtsepi32_epi8(c);
+  __m128i d_part = _mm512_cvtsepi32_epi8(d);
+  __m256i ab =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(a_part), b_part, 1);
+  __m256i cd =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(c_part), d_part, 1);
+  __m512i converted = _mm512_inserti64x4(_mm512_castsi256_si512(ab), cd, 1);
   return converted;
 }
 
@@ -141,7 +148,10 @@ template <>
 EIGEN_STRONG_INLINE Packet32q16i
 pcast<Packet16q32i, Packet32q16i>(const Packet16q32i& a,
                                   const Packet16q32i& b) {
-  __m512i converted = _mm512_packs_epi32(a.val, b.val);
+  __m256i a_part = _mm512_cvtsepi32_epi16(a);
+  __m256i b_part = _mm512_cvtsepi32_epi16(b);
+  __m512i converted =
+      _mm512_inserti64x4(_mm512_castsi256_si512(a_part), b_part, 1);
   return converted;
 }
 
@@ -154,22 +164,45 @@ template <>
 EIGEN_STRONG_INLINE Packet64q8u
 pcast<Packet16q32i, Packet64q8u>(const Packet16q32i& a, const Packet16q32i& b,
                                  const Packet16q32i& c, const Packet16q32i& d) {
-  const __m512i converted = _mm512_packus_epi16(
-      _mm512_packus_epi32(a.val, b.val), _mm512_packus_epi32(c.val, d.val));
+  // Brute-force saturation since there isn't a pack operation for unsigned
+  // numbers that keeps the elements in order.
+  __m128i a_part = _mm512_cvtepi32_epi8(_mm512_max_epi32(
+      _mm512_min_epi32(a, _mm512_set1_epi32(255)), _mm512_setzero_si512()));
+  __m128i b_part = _mm512_cvtepi32_epi8(_mm512_max_epi32(
+      _mm512_min_epi32(b, _mm512_set1_epi32(255)), _mm512_setzero_si512()));
+  __m128i c_part = _mm512_cvtepi32_epi8(_mm512_max_epi32(
+      _mm512_min_epi32(c, _mm512_set1_epi32(255)), _mm512_setzero_si512()));
+  __m128i d_part = _mm512_cvtepi32_epi8(_mm512_max_epi32(
+      _mm512_min_epi32(d, _mm512_set1_epi32(255)), _mm512_setzero_si512()));
+  __m256i ab =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(a_part), b_part, 1);
+  __m256i cd =
+      _mm256_inserti128_si256(_mm256_castsi128_si256(c_part), d_part, 1);
+  __m512i converted = _mm512_inserti64x4(_mm512_castsi256_si512(ab), cd, 1);
   return converted;
 }
 
+#if 0
+// The type Packet32q16u does not exist for AVX-512 yet
 template <>
 struct type_casting_traits<QInt32, QUInt16> {
   enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
 };
 
-#if 0
 template <>
 EIGEN_STRONG_INLINE Packet32q16u
 pcast<Packet16q32i, Packet32q16u>(const Packet16q32i& a,
                                   const Packet16q32i& b) {
-  const __m512i converted = _mm512_packus_epi32(a.val, b.val);
+  // Brute-force saturation since there isn't a pack operation for unsigned
+  // numbers that keeps the elements in order.
+  __m256i a_part =
+      _mm512_cvtepi32_epi16(_mm512_max_epi32(
+        _mm512_min_epi32(a, _mm512_set1_epi32(65535)), _mm512_setzero_si512()));
+  __m256i b_part = _mm512_cvtepi32_epi16(
+    _mm512_max_epi32(_mm512_min_epi32(b, _mm512_set1_epi32(65535)),
+                     _mm512_setzero_si512()));
+  __m512i converted =
+      _mm512_inserti64x4(_mm512_castsi256_si512(a_part), b_part, 1);
   return converted;
 }
 #endif
@@ -177,4 +210,4 @@ pcast<Packet16q32i, Packet32q16u>(const Packet16q32i& a,
 }  // end namespace internal
 }  // end namespace Eigen
 
-#endif  // EIGEN3_UNSUPPORTED_EIGEN_CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
+#endif  // CXX11_SRC_FIXEDPOINT_TYPECASTINGAVX512_H_
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h
deleted file mode 100644
index cbcce9e282685b94842dfcc9cce0e3c5962086f7..0000000000000000000000000000000000000000
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Activations.h
+++ /dev/null
@@ -1,116 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H
-#define EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H
-
-namespace Eigen {
-
-/** scalar_sigmoid_fast_derivative_op
-  * \ingroup CXX11_NeuralNetworks_Module
-  * \brief Template functor to compute the fast derivative of a sigmoid
-  *
-  * Input should be the backpropagated gradient.
-  *
-  * \sa class CwiseUnaryOp, Cwise::sigmoid_fast_derivative()
-  */
-template <typename T>
-struct scalar_sigmoid_fast_derivative_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_fast_derivative_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& y) const {
-    const T one = T(1);
-    return (one - y) * y;
-  }
-
-  template <typename Packet>
-  inline Packet packetOp(const Packet& y) const {
-    const Packet one = internal::pset1<Packet>(1);
-    return internal::pmul(internal::psub(one, y), y);
-  }
-};
-
-namespace internal {
-template <typename T>
-struct functor_traits<scalar_sigmoid_fast_derivative_op<T> > {
-  enum {
-    Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost,
-    PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasMul &&
-                   packet_traits<T>::HasNegate
-  };
-};
-}  // namespace internal
-
-/** scalar_tanh_fast_derivative_op
-  * \ingroup CXX11_NeuralNetworks_Module
-  * \brief Template functor to compute the fast derivative of a tanh
-  *
-  * Input should be the backpropagated gradient.
-  *
-  * \sa class CwiseUnaryOp, Cwise::tanh_fast_derivative()
-  */
-template <typename T>
-struct scalar_tanh_fast_derivative_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_tanh_fast_derivative_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& y) const {
-    const T one = T(1);
-    return one - (y * y);
-  }
-
-  template <typename Packet>
-  inline Packet packetOp(const Packet& y) const {
-    const Packet one = internal::pset1<Packet>(1);
-    return internal::psub(one, internal::pmul(y, y));
-  }
-};
-
-namespace internal {
-template <typename T>
-struct functor_traits<scalar_tanh_fast_derivative_op<T> > {
-  enum {
-    Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 1,
-    PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasMul &&
-                   packet_traits<T>::HasNegate
-  };
-};
-}  // namespace internal
-
-/**
- * \ingroup CXX11_NeuralNetworks_Module
- * \brief Template functor to clip the magnitude of the first scalar.
- *
- * \sa class CwiseBinaryOp, MatrixBase::Clip
- */
-template <typename Scalar>
-struct scalar_clip_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_clip_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
-  operator()(const Scalar& a, const Scalar& b) const {
-    return numext::mini(numext::maxi(a, -b), b);
-  }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
-  packetOp(const Packet& a, const Packet& b) const {
-    return internal::pmin(internal::pmax(a, internal::pnegate(b)), b);
-  }
-};
-
-namespace internal {
-template <typename Scalar>
-struct functor_traits<scalar_clip_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost * 3,
-    PacketAccess = packet_traits<Scalar>::HasMax &&
-                   packet_traits<Scalar>::HasMin &&
-                   packet_traits<Scalar>::HasNegate
-  };
-};
-}  // namespace internal
-
-}  // end namespace Eigen
-
-#endif  // EIGEN_CXX11_NEURAL_NETWORKS_ACTIVATIONS_H
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h
deleted file mode 100644
index d4bc7a3515a91fc5048a811fd710507cd7692e66..0000000000000000000000000000000000000000
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Attention.h
+++ /dev/null
@@ -1,209 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H
-#define EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H
-
-namespace Eigen {
-
-/** ExtractGlimpses
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Extract glimpses from an input tensor.
-  *
-  * The input parameter is expected to be a col-major tensor with a rank of 4 (depth, x, y, and batch).
-  * The width and height parameters specify the extension of the returned glimpses.
-  * The offsets parameter specifies the x, y locations of the center of the glimpses relative to the center of the input image. The vector is expected to contain one IndexPair for each image in the batch dimension.
-  * The normalized boolean indicates if incoming coordinates are normalized so that 0.0 and 1.0 correspond to the minimum and maximum of each height and width dimension.
-  * The centered boolean indicates if incoming coordinates are centered relative to the image, in which case -1.0 and 1.0 correspond to minimum and maximum of each dimension while 0.0 corresponds to the center.
-  *
-  * The result can be assigned to a tensor of rank equal to that of the input. The result will be laid out in col-major order (depth, x, y, batch).
-  * The dimensions of the result will be equal to the dimensions of the input except for width and height which will be equal to the requested glimpse size.
-  */
-namespace {
-template <typename Index>
-struct GlimpseExtractionOp {
-  GlimpseExtractionOp(const Index width, const Index height,
-                      const std::vector<IndexPair<float> >& offsets,
-                      const bool normalized,
-                      const bool centered,
-                      const bool uniform_noise) :
-      width_(width), height_(height), offsets_(offsets),
-      normalized_(normalized), centered_(centered), uniform_noise_(uniform_noise) { }
-
-  template <typename Input>
-  DSizes<Index, 4> dimensions(const Input& input) const {
-    typedef typename internal::traits<Input>::Index IndexType;
-    typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
-                             internal::traits<Input>::Layout, IndexType> > Ref;
-    Ref in(input);
-
-    DSizes<Index, 4> dims = in.dimensions();
-
-    dims[0] = in.dimension(0);
-    dims[1] = width_;
-    dims[2] = height_;
-    dims[3] = in.dimension(3);
-    return dims;
-  }
-
-  template <typename Input, typename Output, typename Device>
-  EIGEN_DEVICE_FUNC
-  void eval(const Input& input, Output& output, const Device& device) const
-  {
-    typedef typename internal::traits<Input>::Index IndexType;
-    typedef TensorRef<Tensor<typename internal::traits<Input>::Scalar, 4,
-                             internal::traits<Input>::Layout, IndexType> > Ref;
-    Ref in(input);
-
-    const Index num_channels = in.dimension(0);
-    const Index input_width = in.dimension(1);
-    const Index input_height = in.dimension(2);
-    const Index batch_size = in.dimension(3);
-    eigen_assert(input_width > 0);
-    eigen_assert(input_height > 0);
-
-    for (Index i = 0; i < batch_size; ++i) {
-      float x = offsets_[i].first, y = offsets_[i].second;
-
-      // Un-normalize coordinates back to pixel space if normalized.
-      if (normalized_) {
-        x *= input_width;
-        y *= input_height;
-      }
-      // Un-center if coordinates are centered on the image center.
-      if (centered_) {
-        x /= 2.0f;
-        y /= 2.0f;
-        x += input_width / 2.0f;
-        y += input_height / 2.0f;
-      }
-      // Remove half of the glimpse window.
-      x -= width_ / 2.0f;
-      y -= height_ / 2.0f;
-
-      const Index offset_x = (Index) x;
-      const Index offset_y = (Index) y;
-      Index glimpse_width = width_;
-      Index glimpse_height = height_;
-      bool partial_overlap = false;
-      DSizes<Index, 3> slice_offset(0, offset_x, offset_y);
-      DSizes<Index, 3> slice_extent(num_channels, width_, height_);
-      DSizes<Index, 3> base_offset(0, 0, 0);
-
-      if (offset_x < 0) {
-        slice_offset[1] = 0;
-        glimpse_width = (std::max<Index>)(0, width_ + offset_x);
-        slice_extent[1] = glimpse_width;
-        base_offset[1] = width_ - glimpse_width;
-        partial_overlap = true;
-      } else if (offset_x + width_ >= input_width) {
-        glimpse_width = (std::max<Index>)(0, input_width - offset_x);
-        slice_extent[1] = glimpse_width;
-        partial_overlap = true;
-      }
-      if (offset_y < 0) {
-        slice_offset[2] = 0;
-        glimpse_height = (std::max<Index>)(0, height_ + offset_y);
-        slice_extent[2] = glimpse_height;
-        base_offset[2] = height_ - glimpse_height;
-        partial_overlap = true;
-      } else if (offset_y + height_ >= input_height) {
-        glimpse_height = (std::max<Index>)(0, input_height - offset_y);
-        slice_extent[2] = glimpse_height;
-        partial_overlap = true;
-      }
-      slice_extent[1] = std::min<Index>(input_width, slice_extent[1]);
-      slice_extent[2] = std::min<Index>(input_height, slice_extent[2]);
-
-      if (partial_overlap) {
-        if (uniform_noise_) {
-          // Initialize the glimpse with uniform noise.
-          typedef typename internal::remove_const<
-            typename internal::traits<Input>::Scalar>::type Scalar;
-          TensorFixedSize<Scalar, Sizes<> > mini;
-          mini.device(device) = input.template chip<3>(i).minimum();
-          TensorFixedSize<float, Sizes<> > range;
-          range.device(device) =
-              (input.template chip<3>(i).maximum() - mini).template cast<float>();
-
-          DSizes<Index, 3> glimpse_size(num_channels, width_, height_);
-          TensorMap<Tensor<float, 3> > tmp(NULL, glimpse_size);
-          output.template chip<3>(i).device(device) =
-              mini.reshape(Sizes<1,1,1>()).broadcast(glimpse_size) +
-              (tmp.random() * range.reshape(Sizes<1,1,1>()).broadcast(glimpse_size)).template cast<Scalar>();
-        } else {
-          // Initialize the glimpse with white noise: compute the mean and sigma
-          // of each channel, and use them to shape the gaussian.
-          DSizes<Index, 2> glimpse_size(width_, height_);
-          DSizes<Index, 2> input_size(input_width, input_height);
-          typedef typename internal::remove_const<
-            typename internal::traits<Input>::Scalar>::type Scalar;
-
-          for (int j = 0; j < num_channels; ++j) {
-            TensorFixedSize<Scalar, Sizes<> > mean;
-            mean.device(device) = input.template chip<3>(i).template chip<0>(j).template cast<float>().mean();
-            TensorFixedSize<float, Sizes<> > sigma;
-            sigma.device(device) =
-                (input.template chip<3>(i).template chip<0>(j).template cast<float>() - mean.reshape(Sizes<1,1>()).broadcast(input_size)).square().mean().sqrt();
-            TensorFixedSize<Scalar, Sizes<> > mini;
-            mini.device(device) = input.template chip<3>(i).template chip<0>(j).minimum();
-            TensorFixedSize<float, Sizes<> > maxi;
-            maxi.device(device) = input.template chip<3>(i).template chip<0>(j).maximum();
-
-            TensorMap<Tensor<float, 2> > tmp(NULL, glimpse_size);
-            output.template chip<3>(i).template chip<0>(j).device(device) =
-                (mean.reshape(Sizes<1,1>()).broadcast(glimpse_size) +
-                 (tmp.random(internal::NormalRandomGenerator<float>()) * sigma.reshape(Sizes<1,1>()).broadcast(glimpse_size)).template cast<Scalar>()).cwiseMin(maxi.reshape(Sizes<1,1>()).broadcast(glimpse_size)).cwiseMax(mini.reshape(Sizes<1,1>()).broadcast(glimpse_size));
-          }
-        }
-
-        // Copy the part of the glimpse that cover the input image if any.
-        if (glimpse_width == 0 || glimpse_height == 0) {
-          continue;
-        }
-        output.template chip<3>(i).slice(base_offset, slice_extent).device(device) = input.template chip<3>(i).slice(slice_offset, slice_extent);
-      } else {
-        output.template chip<3>(i).device(device) = input.template chip<3>(i).slice(slice_offset, slice_extent);
-      }
-    }
-  }
-
- private:
-  const Index width_;
-  const Index height_;
-  const std::vector<IndexPair<float> > offsets_;
-  const bool normalized_;
-  const bool centered_;
-  const bool uniform_noise_;
-};
-}
-
-
-template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorCustomUnaryOp<const GlimpseExtractionOp<typename internal::traits<Input>::Index>, const Input>
-ExtractGlimpses(const Input& input,
-                const typename internal::traits<Input>::Index width,
-                const typename internal::traits<Input>::Index height,
-                const std::vector<IndexPair<float> >& offsets,
-                const bool normalized = true, const bool centered = true,
-                const bool uniform_noise = true)
-{
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  typedef typename internal::traits<Input>::Index Index;
-  const GlimpseExtractionOp<Index> op(width, height, offsets, normalized,
-                                      centered, uniform_noise);
-  return input.customOp(op);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_NEURAL_NETWORKS_ATTENTION_H
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h
deleted file mode 100644
index 12ce23444c092ea96ee4b3c8bd2c84d440f2c500..0000000000000000000000000000000000000000
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardCuboidConvolutions.h
+++ /dev/null
@@ -1,523 +0,0 @@
-#ifndef EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H
-#define EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H
-
-#include "Patch3d.h"
-
-namespace Eigen {
-
-/** CuboidConvolutionBackwardInput
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Computes the backprop for the input of a 3D convolution.
-  *
-  * The output_backward parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others)
-  * The kernel parameter is expected to be a 5D tensor (filters, channels, kernel_depth, kernel_height, kernel_width)
-  * output_backward and kernel have to be in the same layout.
-  *
-  * The dimensions of the result will be filters, depth, height, width (and others if applicable).
-  *
-  * It is possible to swap the order of the depth, width and height dimensions provided that the same order is used in the input, the kernel, and the output.
-  *
-  * All dimension orders above are given for col-major, and should be reversed for row-major.
-  */
-
-template <typename OutputBackward, typename Kernel>
-EIGEN_ALWAYS_INLINE static const typename internal::conditional<
-    internal::traits<OutputBackward>::Layout == ColMajor,
-    TensorReshapingOp<
-        const DSizes<typename internal::traits<OutputBackward>::Index,
-                     internal::traits<OutputBackward>::NumDimensions>,
-        const TensorContractionOp<
-            const array< IndexPair<typename internal::traits<OutputBackward>::Index>, 2>,
-            const TensorReshapingOp<
-                const DSizes< typename internal::traits<OutputBackward>::Index, 3>,
-                const TensorReverseOp<const array<bool, 5>, const Kernel>
-            >,
-            const TensorReshapingOp<
-                const DSizes< typename internal::traits<OutputBackward>::Index, 3>,
-                const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const OutputBackward>
-            >
-        >
-    >,
-    TensorReshapingOp<
-        const DSizes<typename internal::traits<OutputBackward>::Index,
-                     internal::traits<OutputBackward>::NumDimensions>,
-        const TensorContractionOp<
-            const array< IndexPair<typename internal::traits<OutputBackward>::Index>, 2>,
-            const TensorReshapingOp<
-                const DSizes< typename internal::traits<OutputBackward>::Index, 3>,
-                const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const OutputBackward>
-            >,
-            const TensorReshapingOp<
-                const DSizes<typename internal::traits<OutputBackward>::Index, 3>,
-                const TensorReverseOp<const array<bool, 5>, const Kernel>
-            >
-        >
-    >
->::type
-CuboidConvolutionBackwardInput(
-    const Kernel& kernel, const OutputBackward& output_backward,
-    typename internal::traits<OutputBackward>::Index inputPlanes,
-    typename internal::traits<OutputBackward>::Index inputRows,
-    typename internal::traits<OutputBackward>::Index inputCols,
-    const DenseIndex stridePlanes = 1, const DenseIndex strideRows = 1,
-    const DenseIndex strideCols = 1) {
-  typedef typename internal::traits<OutputBackward>::Index TensorIndex;
-  const TensorRef<const Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, internal::traits<Kernel>::Layout, TensorIndex> > kern(kernel);
-  const TensorRef<const Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, internal::traits<OutputBackward>::Layout, TensorIndex> > out(output_backward);
-
-  EIGEN_STATIC_ASSERT(internal::traits<Kernel>::Layout == internal::traits<OutputBackward>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  static const bool isColMajor = (internal::traits<OutputBackward>::Layout == ColMajor);
-
-  static const int NumDims = internal::traits<OutputBackward>::NumDimensions;
-
-  // Number of filters to apply. This is the same as the output depth of the result
-  const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[4];
-  // Number of channels. This is the same as the input depth.
-  const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[3];
-  const TensorIndex kernelPlanes = isColMajor ? kern.dimensions()[2] : kern.dimensions()[2];
-  const TensorIndex kernelRows = isColMajor ? kern.dimensions()[3] : kern.dimensions()[1];
-  const TensorIndex kernelCols = isColMajor ? kern.dimensions()[4] : kern.dimensions()[0];
-
-  const TensorIndex outputPlanes = isColMajor ? out.dimensions()[1] : out.dimensions()[NumDims - 2];
-  const TensorIndex outputRows = isColMajor ? out.dimensions()[2] : out.dimensions()[NumDims - 3];
-  const TensorIndex outputCols = isColMajor ? out.dimensions()[3] : out.dimensions()[NumDims - 4];
-
-  TensorIndex forward_pad_z, forward_pad_y, forward_pad_x;
-  const TensorIndex size_z = ceil(inputPlanes / static_cast<float>(stridePlanes));
-  const TensorIndex size_y = ceil(inputRows / static_cast<float>(strideRows));
-  const TensorIndex size_x = ceil(inputCols / static_cast<float>(strideCols));
-
-  // Infer padding type.
-  if (size_z == outputPlanes && size_y == outputRows && size_x == outputCols) {
-    // SAME padding.
-    const TensorIndex dz = size_z * stridePlanes + kernelPlanes - 1 - inputPlanes;
-    const TensorIndex dy = size_y * strideRows + kernelRows - 1 - inputRows;
-    const TensorIndex dx = size_x * strideCols + kernelCols - 1 - inputCols;
-
-    forward_pad_z = dz - dz / 2;
-    forward_pad_y = dy - dy / 2;
-    forward_pad_x = dx - dx / 2;
-  } else {
-    // VALID padding.
-    forward_pad_z = 0;
-    forward_pad_y = 0;
-    forward_pad_x = 0;
-  }
-  const TensorIndex padding_ztop = kernelPlanes - 1 - forward_pad_z;
-  const TensorIndex padding_top = kernelRows - 1 - forward_pad_y;
-  const TensorIndex padding_left = kernelCols - 1 - forward_pad_x;
-
-  const TensorIndex padding_zbottom = inputPlanes + kernelPlanes - 1 - (outputPlanes - 1) * stridePlanes - 1 - padding_ztop;
-  const TensorIndex padding_bottom = inputRows + kernelRows - 1 - (outputRows - 1) * strideRows - 1 - padding_top;
-  const TensorIndex padding_right = inputCols + kernelCols - 1 - (outputCols - 1) * strideCols - 1 - padding_left;
-
-  eigen_assert(padding_ztop >= 0);
-  eigen_assert(padding_zbottom >= 0);
-  eigen_assert(padding_top >= 0);
-  eigen_assert(padding_left >= 0);
-  eigen_assert(padding_bottom >= 0);
-  eigen_assert(padding_right >= 0);
-
-  // The kernel has dimensions filters X channels X patch_planes X patch_rows X patch_cols.
-  // We need to reverse the kernel along the spatial dimensions.
-  array<bool, 5> kernel_reverse;
-  if (isColMajor) {
-    kernel_reverse[0] = false;
-    kernel_reverse[1] = false;
-    kernel_reverse[2] = true;
-    kernel_reverse[3] = true;
-    kernel_reverse[4] = true;
-  } else {
-    kernel_reverse[0] = true;
-    kernel_reverse[1] = true;
-    kernel_reverse[2] = true;
-    kernel_reverse[3] = false;
-    kernel_reverse[4] = false;
-  }
-
-  DSizes<TensorIndex, 3> kernel_dims;
-  if (isColMajor) {
-    kernel_dims[0] = kernelFilters;
-    kernel_dims[1] = kernelChannels;
-    kernel_dims[2] = kernelRows * kernelCols * kernelPlanes;
-  } else {
-    kernel_dims[0] = kernelRows * kernelCols * kernelPlanes;
-    kernel_dims[1] = kernelChannels;
-    kernel_dims[2] = kernelFilters;
-  }
-
-  // The output_backward has dimensions out_depth X out_planes X out_rows X out_cols X OTHERS
-  // When we extract the image patches from output_backward, it will have dimensions:
-  //   out_depth X (patch_planes * patch_rows * patch_cols) X (input_planes * input_rows * input_cols * OTHERS)
-  DSizes<TensorIndex, 3> pre_contract_dims;
-  if (isColMajor) {
-    pre_contract_dims[0] = kernelFilters;
-    pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes;
-    pre_contract_dims[2] = inputRows * inputCols * inputPlanes;
-    for (int i = 4; i < NumDims; ++i) {
-      pre_contract_dims[2] *= out.dimension(i);
-    }
-  } else {
-    pre_contract_dims[2] = kernelFilters;
-    pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes;
-    pre_contract_dims[0] = inputRows * inputCols * inputPlanes;
-    for (int i = 0; i < NumDims - 4; ++i) {
-      pre_contract_dims[0] *= out.dimension(i);
-    }
-  }
-
-  // We will contract along dimensions (0, 2) in kernel and (0, 1) in
-  // output_backward, if this is col-major, and
-  // dimensions (0, 2) in kernel and (1, 2) in output_backward, if this row-major.
-  array<IndexPair<TensorIndex>, 2> contract_dims;
-  if (isColMajor) {
-    // col-major: kernel.contract(output.patches)
-    contract_dims[0] = IndexPair<TensorIndex>(0, 0);
-    contract_dims[1] = IndexPair<TensorIndex>(2, 1);
-  } else {
-    // row-major: output.patches.contract(kernel)
-    contract_dims[0] = IndexPair<TensorIndex>(1, 0);
-    contract_dims[1] = IndexPair<TensorIndex>(2, 2);
-  }
-
-  // Post contraction, the dimensions of the input_backprop is
-  //  channels X input_planes X input_rows X input_cols X OTHERS
-  DSizes<TensorIndex, NumDims> post_contract_dims;
-  if (isColMajor) {
-    post_contract_dims[0] = kernelChannels;
-    post_contract_dims[1] = inputPlanes;
-    post_contract_dims[2] = inputRows;
-    post_contract_dims[3] = inputCols;
-    for (int i = 4; i < NumDims; ++i) {
-      post_contract_dims[i] = out.dimension(i);
-    }
-  } else {
-    post_contract_dims[NumDims - 1] = kernelChannels;
-    post_contract_dims[NumDims - 2] = inputPlanes;
-    post_contract_dims[NumDims - 3] = inputRows;
-    post_contract_dims[NumDims - 4] = inputCols;
-    for (int i = 0; i < NumDims - 4; ++i) {
-      post_contract_dims[i] = out.dimension(i);
-    }
-  }
-
-  DSizes<TensorIndex, NumDims> strides;
-  for (int i = 0; i < NumDims; i++) {
-    strides[i] = 1;
-  }
-  if (isColMajor) {
-    strides[1] = stridePlanes;
-    strides[2] = strideRows;
-    strides[3] = strideCols;
-  } else {
-    strides[NumDims - 2] = stridePlanes;
-    strides[NumDims - 3] = strideRows;
-    strides[NumDims - 4] = strideCols;
-  }
-
-  return choose(
-      Cond<internal::traits<OutputBackward>::Layout == ColMajor>(),
-      kernel.reverse(kernel_reverse)
-          .reshape(kernel_dims)
-          .contract(
-              output_backward.extract_volume_patches(kernelPlanes, kernelRows, kernelCols,
-                                                     1, 1, 1, stridePlanes, strideRows, strideCols,
-                               padding_ztop, padding_zbottom,
-                               padding_top, padding_bottom,
-                               padding_left, padding_right)
-                  .reshape(pre_contract_dims),
-              contract_dims)
-          .reshape(post_contract_dims),
-      output_backward.extract_volume_patches(kernelPlanes, kernelRows, kernelCols,
-                                             1, 1, 1, stridePlanes, strideRows, strideCols,
-                       padding_ztop, padding_zbottom,
-                       padding_top, padding_bottom,
-                       padding_left, padding_right)
-          .reshape(pre_contract_dims)
-          .contract(kernel.reverse(kernel_reverse).reshape(kernel_dims),
-                    contract_dims)
-          .reshape(post_contract_dims));
-}
-
-
-/** CuboidConvolutionBackwardKernel
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Computes the backprop for the filter of a 3D convolution.
-  *
-  * The output_backward parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others)
-  * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_depth, kernel_height, kernel_width)
-  * output_backward and kernel have to be in the same layout.
-  *
-  * The dimensions of the result will be filters, depth, height, width (and others if applicable).
-  *
-  * It is possible to swap the order of the depth, width and height dimensions provided that the same order is used in the input, the kernel, and the output.
-  *
-  * All dimension orders above are given for col-major, and should be reversed for row-major.
-  */
-template <typename OutputBackward, typename Input>
-EIGEN_ALWAYS_INLINE static const typename internal::conditional<
-    internal::traits<OutputBackward>::Layout == ColMajor,
-    const TensorShufflingOp<
-        const array<typename internal::traits<OutputBackward>::Index, 5>,
-        const TensorReverseOp<
-            const array<bool, 5>,
-            const TensorReshapingOp<
-                const DSizes<typename internal::traits<OutputBackward>::Index, 5>,
-                const TensorContractionOp<
-                    const array< IndexPair<typename internal::traits<Input>::Index>, 2>,
-                    const TensorReshapingOp<
-                        const DSizes<typename internal::traits<Input>::Index, 3>,
-                        const Input>,
-                    const TensorReshapingOp<
-                        const DSizes< typename internal::traits<OutputBackward>::Index, 4>,
-                        const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const OutputBackward>
-                    >
-                >
-            >
-        >
-    >,
-    const TensorShufflingOp<
-        const array<typename internal::traits<OutputBackward>::Index, 5>,
-        const TensorReverseOp<
-            const array<bool, 5>,
-            const TensorReshapingOp<
-                const DSizes<typename internal::traits<OutputBackward>::Index, 5>,
-                const TensorContractionOp<
-                    const array< IndexPair<typename internal::traits<Input>::Index>, 2>,
-                    const TensorReshapingOp<
-                        const DSizes< typename internal::traits<OutputBackward>::Index, 4>,
-                        const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const OutputBackward>
-                    >,
-                    const TensorReshapingOp<
-                        const DSizes<typename internal::traits<Input>::Index, 3>,
-                        const Input
-                    >
-                >
-            >
-        >
-    >
->::type
-CuboidConvolutionBackwardKernel(
-    const Input& input, const OutputBackward& output_backward,
-    typename internal::traits<Input>::Index kernelPlanes,
-    typename internal::traits<Input>::Index kernelRows,
-    typename internal::traits<Input>::Index kernelCols,
-    const DenseIndex stridePlanes = 1,
-    const DenseIndex strideRows = 1,
-    const DenseIndex strideCols = 1) {
-  typedef typename internal::traits<Input>::Index TensorIndex;
-  TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
-  TensorRef<Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, internal::traits<OutputBackward>::Layout, TensorIndex> > out(output_backward);
-
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<OutputBackward>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
-
-  static const int NumDims = internal::traits<Input>::NumDimensions;
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == internal::traits<OutputBackward>::NumDimensions, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
-  const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
-  const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4);
-
-  const TensorIndex outputPlanes = isColMajor ? out.dimension(1) : out.dimension(NumDims - 2);
-  const TensorIndex outputRows = isColMajor ? out.dimension(2) : out.dimension(NumDims - 3);
-  const TensorIndex outputCols = isColMajor ? out.dimension(3) : out.dimension(NumDims - 4);
-
-  const TensorIndex kernelFilters = isColMajor ? out.dimension(0) : out.dimension(NumDims - 1);
-  const TensorIndex kernelChannels = isColMajor ? in.dimension(0) : in.dimension(NumDims - 1);
-
-  TensorIndex forward_pad_z, forward_pad_y, forward_pad_x;
-  const TensorIndex size_z = ceil(inputPlanes / static_cast<float>(stridePlanes));
-  const TensorIndex size_y = ceil(inputRows / static_cast<float>(strideRows));
-  const TensorIndex size_x = ceil(inputCols / static_cast<float>(strideCols));
-
-  // Infer padding type.
-  if (size_z == outputPlanes && size_y == outputRows && size_x == outputCols) {
-    // SAME padding.
-    const TensorIndex dz = size_z * stridePlanes + kernelPlanes - 1 - inputPlanes;
-    const TensorIndex dy = size_y * strideRows + kernelRows - 1 - inputRows;
-    const TensorIndex dx = size_x * strideCols + kernelCols - 1 - inputCols;
-
-    forward_pad_z = dz - dz / 2;
-    forward_pad_y = dy - dy / 2;
-    forward_pad_x = dx - dx / 2;
-  } else {
-    // VALID padding.
-    forward_pad_z = 0;
-    forward_pad_y = 0;
-    forward_pad_x = 0;
-  }
-
-  const TensorIndex padding_ztop = kernelPlanes - 1 - forward_pad_z;
-  const TensorIndex padding_top = kernelRows - 1 - forward_pad_y;
-  const TensorIndex padding_left = kernelCols - 1 - forward_pad_x;
-
-  const TensorIndex padding_zbottom = inputPlanes + kernelPlanes - 1 - (outputPlanes - 1) * stridePlanes - 1 - padding_ztop;
-  const TensorIndex padding_bottom = inputRows + kernelRows - 1 - (outputRows - 1) * strideRows - 1 - padding_top;
-  const TensorIndex padding_right = inputCols + kernelCols - 1 - (outputCols - 1) * strideCols - 1 - padding_left;
-
-  eigen_assert(padding_ztop >= 0);
-  eigen_assert(padding_zbottom >= 0);
-  eigen_assert(padding_top >= 0);
-  eigen_assert(padding_left >= 0);
-  eigen_assert(padding_bottom >= 0);
-  eigen_assert(padding_right >= 0);
-
-  // The output_backward has dimensions out_depth X out_plaens X out_rows X out_cols X OTHERS
-  // When we extract the image patches from output_backward (with input as the
-  // kernel), it will have dimensions
-  //  (out_depth) X (input_planes * input_rows * input_cols) X (kernel_planes * kernel_rows * kernel_cols) X OTHERS
-  DSizes<TensorIndex, 4> pre_contract_dims;
-  if (isColMajor) {
-    pre_contract_dims[0] = kernelFilters;
-    pre_contract_dims[1] = inputRows * inputCols * inputPlanes;
-    pre_contract_dims[2] = kernelRows * kernelCols * kernelPlanes;
-    pre_contract_dims[3] = 1;
-    for (int i = 4; i < NumDims; ++i) {
-      pre_contract_dims[3] *= out.dimension(i);
-    }
-  } else {
-    pre_contract_dims[3] = kernelFilters;
-    pre_contract_dims[2] = inputRows * inputCols * inputPlanes;
-    pre_contract_dims[1] = kernelRows * kernelCols * kernelPlanes;
-    pre_contract_dims[0] = 1;
-    for (int i = 0; i < NumDims - 4; ++i) {
-      pre_contract_dims[0] *= out.dimension(i);
-    }
-  }
-
-  // The input has dimensions in_depth X (input_planes * input_rows * input_cols) X OTHERS
-  DSizes<TensorIndex, 3> input_dims;
-  if (isColMajor) {
-    input_dims[0] = kernelChannels;
-    input_dims[1] = inputRows * inputCols * inputPlanes;
-    input_dims[2] = 1;
-    for (int i = 4; i < NumDims; ++i) {
-      input_dims[2] *= in.dimension(i);
-    }
-    eigen_assert(input_dims[2] == pre_contract_dims[3]);
-  } else {
-    input_dims[2] = kernelChannels;
-    input_dims[1] = inputRows * inputCols * inputPlanes;
-    input_dims[0] = 1;
-    for (int i = 0; i < NumDims - 4; ++i) {
-      input_dims[0] *= in.dimension(i);
-    }
-    eigen_assert(input_dims[0] == pre_contract_dims[0]);
-  }
-
-  // We will contract along dimensions (1, 2) in in and (1, 3) in out, if
-  // this is col-major.
-  // For row-major, it's dimensions (0, 1) in in and (0, 2) in out.
-  array<IndexPair<TensorIndex>, 2> contract_dims;
-  if (isColMajor) {
-    // col-major: in.contract(output.patches)
-    contract_dims[0] = IndexPair<TensorIndex>(1, 1);
-    contract_dims[1] = IndexPair<TensorIndex>(2, 3);
-  } else {
-    // row-major: output.patches.contract(in)
-    contract_dims[0] = IndexPair<TensorIndex>(0, 0);
-    contract_dims[1] = IndexPair<TensorIndex>(2, 1);
-  }
-
-  // After the contraction, the kernel will have dimension
-  //   in_depth X out_depth X kernel_patches X kernel_rows X kernel_cols
-  // We will need to shuffle the first two dimensions and reverse the spatial dimensions.
-  // The end shape is:
-  //   out_depth X in_shape X kernel_planes X kernel_rows X kernel_cols
-
-  // This is the shape of the kernel *before* the shuffling.
-  DSizes<TensorIndex, 5> kernel_dims;
-  if (isColMajor) {
-    kernel_dims[0] = kernelChannels;
-    kernel_dims[1] = kernelFilters;
-    kernel_dims[2] = kernelPlanes;
-    kernel_dims[3] = kernelRows;
-    kernel_dims[4] = kernelCols;
-  } else {
-    kernel_dims[0] = kernelCols;
-    kernel_dims[1] = kernelRows;
-    kernel_dims[2] = kernelPlanes;
-    kernel_dims[3] = kernelFilters;
-    kernel_dims[4] = kernelChannels;
-  }
-
-  // Flip filters and channels.
-  array<TensorIndex, 5> kernel_shuffle;
-  if (isColMajor) {
-    kernel_shuffle[0] = 1;
-    kernel_shuffle[1] = 0;
-    kernel_shuffle[2] = 2;
-    kernel_shuffle[3] = 3;
-    kernel_shuffle[4] = 4;
-  } else {
-    kernel_shuffle[0] = 0;
-    kernel_shuffle[1] = 1;
-    kernel_shuffle[2] = 2;
-    kernel_shuffle[3] = 4;
-    kernel_shuffle[4] = 3;
-  }
-
-  // Reverse the spatial dimensions.
-  array<bool, 5> kernel_reverse;
-  if (isColMajor) {
-    kernel_reverse[0] = false;
-    kernel_reverse[1] = false;
-    kernel_reverse[2] = true;
-    kernel_reverse[3] = true;
-    kernel_reverse[4] = true;
-  } else {
-    kernel_reverse[0] = true;
-    kernel_reverse[1] = true;
-    kernel_reverse[2] = true;
-    kernel_reverse[3] = false;
-    kernel_reverse[4] = false;
-  }
-
-  DSizes<TensorIndex, NumDims> strides;
-  for (int i = 0; i < NumDims; i++) {
-    strides[i] = 1;
-  }
-  if (isColMajor) {
-    strides[1] = stridePlanes;
-    strides[2] = strideRows;
-    strides[3] = strideCols;
-  } else {
-    strides[NumDims - 2] = stridePlanes;
-    strides[NumDims - 3] = strideRows;
-    strides[NumDims - 4] = strideCols;
-  }
-  return choose(
-      Cond<internal::traits<Input>::Layout == ColMajor>(),
-      input.reshape(input_dims)
-          .contract(
-              output_backward.extract_volume_patches(
-                                 inputPlanes, inputRows, inputCols, 1,
-                                 1, 1, stridePlanes, strideRows, strideCols,
-
-                                 padding_ztop, padding_zbottom, padding_top,
-                                 padding_bottom, padding_left, padding_right)
-                  .reshape(pre_contract_dims),
-              contract_dims)
-          .reshape(kernel_dims)
-          .reverse(kernel_reverse)
-          .shuffle(kernel_shuffle),
-      output_backward.extract_volume_patches(
-                         inputPlanes, inputRows, inputCols, 1, 1, 1,
-                         stridePlanes, strideRows, strideCols, padding_ztop,
-                         padding_zbottom, padding_top, padding_bottom,
-                         padding_left, padding_right)
-          .reshape(pre_contract_dims)
-          .contract(input.reshape(input_dims), contract_dims)
-          .reshape(kernel_dims)
-          .reverse(kernel_reverse)
-          .shuffle(kernel_shuffle));
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_CUBOID_CONVOLUTIONS_H
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h
deleted file mode 100644
index 0f4ada246c702a1c5138b04ebeab6fca73b35b26..0000000000000000000000000000000000000000
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/BackwardSpatialConvolutions.h
+++ /dev/null
@@ -1,351 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Ke Yang <yangke@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H
-#define EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H
-
-namespace Eigen {
-
-/** SpatialConvolutionBackwardInput
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Computes the backprop for the input of a 2D convolution.
-  *
-  * The output_backward parameter is expected to be a tensor with a rank of 3 or more (channels, height, width, and optionally others)
-  * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_height, kernel_width)
-  * The output_backward and the kernel must both be in col-major layout. The result will also be in col-major layout.
-  *
-  * If in_stride > 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels.
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the output_backward. The dimensions of the result will be filters, height, width (and others if applicable).
-  *
-  * It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output.
-  *
-  */
-
-template <typename OutputBackward, typename Kernel>
-EIGEN_ALWAYS_INLINE
-static const typename internal::conditional<
-  internal::traits<OutputBackward>::Layout == ColMajor,
-  TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, internal::traits<OutputBackward>::NumDimensions>, const TensorContractionOp<const array<IndexPair<typename internal::traits<OutputBackward>::Index>, 2>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 3>, const TensorReverseOp<const array<bool, 4>, const Kernel> >, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 3>, const TensorImagePatchOp<Dynamic, Dynamic, const OutputBackward> > > >,
-  TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, internal::traits<OutputBackward>::NumDimensions>, const TensorContractionOp<const array<IndexPair<typename internal::traits<OutputBackward>::Index>, 2>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 3>, const TensorImagePatchOp<Dynamic, Dynamic, const OutputBackward> >, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 3>, const TensorReverseOp<const array<bool, 4>, const Kernel> > > > >::type
-SpatialConvolutionBackwardInput(const Kernel& kernel, const OutputBackward& output_backward, typename internal::traits<OutputBackward>::Index inputRows, typename internal::traits<OutputBackward>::Index inputCols, const DenseIndex stride = 1, const DenseIndex in_stride = 1) {
-
-  typedef typename internal::traits<OutputBackward>::Index TensorIndex;
-  TensorRef<Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, internal::traits<Kernel>::Layout, TensorIndex> > kern(kernel);
-  TensorRef<Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, internal::traits<OutputBackward>::Layout, TensorIndex> > out(output_backward);
-
-  EIGEN_STATIC_ASSERT(internal::traits<Kernel>::Layout == internal::traits<OutputBackward>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  static const bool isColMajor = (internal::traits<OutputBackward>::Layout == ColMajor);
-
-  static const int NumDims = internal::traits<OutputBackward>::NumDimensions;
-
-  // Number of filters to apply. This is the same as the output depth of the result
-  const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[3];
-  // Number of channels. This is the same as the input depth.
-  const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[2];
-  const TensorIndex kernelRows = isColMajor ? kern.dimensions()[2] : kern.dimensions()[1];
-  const TensorIndex kernelCols = isColMajor ? kern.dimensions()[3] : kern.dimensions()[0];
-
-  // This is the effective kernel size, taking into account the (in_stride - 1) zero-values
-  // inserted between consecutive kernel elements in atrous convolution
-  const TensorIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1);
-  const TensorIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1);
-
-  const TensorIndex outputRows = isColMajor ? output_backward.dimension(1) : output_backward.dimension(NumDims - 2);
-  const TensorIndex outputCols = isColMajor ? output_backward.dimension(2) : output_backward.dimension(NumDims - 3);
-
-  // Computing the forward padding
-  const TensorIndex forward_pad_top = ((outputRows - 1) * stride + kernelRowsEff - inputRows) / 2;
-  const TensorIndex forward_pad_left = ((outputCols - 1) * stride + kernelColsEff - inputCols) / 2;
-
-  const TensorIndex padding_top = kernelRowsEff - 1 - forward_pad_top;
-  const TensorIndex padding_left = kernelColsEff - 1 - forward_pad_left;
-  const TensorIndex padding_bottom = inputRows + kernelRowsEff - 1 - (outputRows - 1) * stride - 1 - padding_top;
-  const TensorIndex padding_right = inputCols + kernelColsEff - 1 - (outputCols - 1) * stride - 1 - padding_left;
-
-  eigen_assert(padding_top >= 0);
-  eigen_assert(padding_left >= 0);
-  eigen_assert(padding_bottom >= 0);
-  eigen_assert(padding_right >= 0);
-
-  // The kernel has dimensions filters X channels X patch_rows X patch_cols
-  // We need to reverse the kernel along dimensions corresponding to rows and
-  // cols.
-  // TODO(yangke): we can make things slightly faster by collapsing the dimensions
-  // where we don't reverse. Try that once we have a faster compiler.
-  array<bool, 4> kernel_reverse;
-  if (isColMajor) {
-    kernel_reverse[0] = false;
-    kernel_reverse[1] = false;
-    kernel_reverse[2] = true;
-    kernel_reverse[3] = true;
-  } else {
-    kernel_reverse[0] = true;
-    kernel_reverse[1] = true;
-    kernel_reverse[2] = false;
-    kernel_reverse[3] = false;
-  }
-
-  DSizes<TensorIndex, 3> kernel_dims;
-  if (isColMajor) {
-    kernel_dims[0] = kernelFilters;
-    kernel_dims[1] = kernelChannels;
-    kernel_dims[2] = kernelRows * kernelCols;
-  } else {
-    kernel_dims[0] = kernelRows * kernelCols;
-    kernel_dims[1] = kernelChannels;
-    kernel_dims[2] = kernelFilters;
-  }
-
-  // The output_backward has dimensions out_depth X out_rows X out_cols X OTHERS
-  // When we extract the image patches from output_backward, it will have dimensions
-  //   out_depth X (patch_rows * patch_cols) X (input_rows * input_cols * OTHERS)
-  DSizes<TensorIndex, 3> pre_contract_dims;
-  if (isColMajor) {
-    pre_contract_dims[0] = kernelFilters;
-    pre_contract_dims[1] = kernelRows * kernelCols;
-    pre_contract_dims[2] = inputRows * inputCols;
-    for (int i = 3; i < NumDims; ++i) {
-      pre_contract_dims[2] *= out.dimension(i);
-    }
-  } else {
-    pre_contract_dims[2] = kernelFilters;
-    pre_contract_dims[1] = kernelRows * kernelCols;
-    pre_contract_dims[0] = inputRows * inputCols;
-    for (int i = 0; i < NumDims - 3; ++i) {
-      pre_contract_dims[0] *= out.dimension(i);
-    }
-  }
-
-  // We will contract along dimensions (0, 2) in kernel and (0, 1) in
-  // output_backward, if this is col-major, and
-  // dimensions (0, 2) in kernel and (1, 2) in output_backward, if this row-major.
-  array<IndexPair<TensorIndex>, 2> contract_dims;
-  if (isColMajor) {
-    // col-major: kernel.contract(output.patches)
-    contract_dims[0] = IndexPair<TensorIndex>(0, 0);
-    contract_dims[1] = IndexPair<TensorIndex>(2, 1);
-  } else {
-    // row-major: output.patches.contract(kernel)
-    contract_dims[0] = IndexPair<TensorIndex>(1, 0);
-    contract_dims[1] = IndexPair<TensorIndex>(2, 2);
-  }
-
-  // Post contraction, the dimensions of the input_backprop is
-  //  channels X input_rows X input_cols X OTHERS
-  DSizes<TensorIndex, NumDims> post_contract_dims;
-  if (isColMajor) {
-    post_contract_dims[0] = kernelChannels;
-    post_contract_dims[1] = inputRows;
-    post_contract_dims[2] = inputCols;
-    for (int i = 3; i < NumDims; ++i) {
-      post_contract_dims[i] = out.dimension(i);
-    }
-  } else {
-    post_contract_dims[NumDims - 1] = kernelChannels;
-    post_contract_dims[NumDims - 2] = inputRows;
-    post_contract_dims[NumDims - 3] = inputCols;
-    for (int i = 0; i < NumDims - 3; ++i) {
-      post_contract_dims[i] = out.dimension(i);
-    }
-  }
-
-  return choose(Cond<internal::traits<OutputBackward>::Layout == ColMajor>(),
-                kernel.reverse(kernel_reverse).reshape(kernel_dims).contract(output_backward.extract_image_patches(kernelRows, kernelCols, 1, 1, in_stride, in_stride, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims), contract_dims).reshape(post_contract_dims),
-                output_backward.extract_image_patches(kernelRows, kernelCols, 1, 1, in_stride, in_stride, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).contract(kernel.reverse(kernel_reverse).reshape(kernel_dims), contract_dims).reshape(post_contract_dims));
-}
-
-
-/** SpatialConvolutionBackwardKernel
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Computes the backprop for the filter of a 2D convolution.
-  *
-  * The output_backward parameter is expected to be a tensor with a rank of 3 or more (channels, height, width, and optionally others)
-  * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_height, kernel_width)
-  * The output_backward and the kernel must both be in col-major layout. The result will also be in col-major layout.
-  *
-  * If in_stride > 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels.
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the output_backward. The dimensions of the result will be filters, height, width (and others if applicable).
-  *
-  * It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output.
-  *
-  */
-// TODO(gpapan): Resolve a bug in TensorContractionInputMapper at SpatialConvolutions.h that yangke circumvented by using .reshape().reshape().
-// This can significantly accelerate SpatialConvolutionBackwardKernel.
-
-template <typename OutputBackward, typename Input>
-EIGEN_ALWAYS_INLINE
-static const typename internal::conditional<
-  internal::traits<OutputBackward>::Layout == ColMajor,
-  const TensorShufflingOp<const array<typename internal::traits<OutputBackward>::Index, 4>, const TensorReverseOp<const array<bool, 4>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorContractionOp<const array<IndexPair<typename internal::traits<Input>::Index>, 2>, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 3>, const Input>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorImagePatchOp<Dynamic, Dynamic, const OutputBackward> > > > > > >,
-  const TensorShufflingOp<const array<typename internal::traits<OutputBackward>::Index, 4>, const TensorReverseOp<const array<bool, 4>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorContractionOp<const array<IndexPair<typename internal::traits<Input>::Index>, 2>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorReshapingOp<const DSizes<typename internal::traits<OutputBackward>::Index, 4>, const TensorImagePatchOp<Dynamic, Dynamic, const OutputBackward> > >, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 3>, const Input> > > > > >::type
-SpatialConvolutionBackwardKernel(const Input& input, const OutputBackward& output_backward, typename internal::traits<Input>::Index kernelRows, typename internal::traits<Input>::Index kernelCols, const DenseIndex stride = 1, const DenseIndex in_stride = 1) {
-
-  typedef typename internal::traits<Input>::Index TensorIndex;
-  TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
-  TensorRef<Tensor<typename internal::traits<OutputBackward>::Scalar, internal::traits<OutputBackward>::NumDimensions, internal::traits<OutputBackward>::Layout, TensorIndex> > out(output_backward);
-
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<OutputBackward>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  // stride and in_stride cannot both be larger than 1
-  eigen_assert(!(stride > 1 && in_stride > 1));
-
-  static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
-
-  static const int NumDims = internal::traits<Input>::NumDimensions;
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == internal::traits<OutputBackward>::NumDimensions, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  const TensorIndex inputRows = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
-  const TensorIndex inputCols = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
-
-  const TensorIndex outputRows = isColMajor ? output_backward.dimension(1) : output_backward.dimension(NumDims - 2);
-  const TensorIndex outputCols = isColMajor ? output_backward.dimension(2) : output_backward.dimension(NumDims - 3);
-
-  // Number of filters to apply. This is the same as the output depth of the result
-  const TensorIndex kernelFilters = isColMajor ? out.dimensions()[0] : out.dimensions()[NumDims - 1];
-
-  // Number of channels. This is the same as the input depth.
-  const TensorIndex kernelChannels = isColMajor ? in.dimensions()[0] : in.dimensions()[NumDims - 1];
-
-  // This is the effective kernel size, taking into account the (in_stride - 1) zero-values
-  // inserted between consecutive kernel elements in atrous convolution
-  const TensorIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1);
-  const TensorIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1);
-
-  // Computing the forward padding
-  const TensorIndex forward_pad_top = ((outputRows - 1) * stride + kernelRowsEff - inputRows) / 2;
-  const TensorIndex forward_pad_left = ((outputCols - 1) * stride + kernelColsEff - inputCols) / 2;
-
-  // TODO: factor out the padding computation.
-  const TensorIndex padding_top = kernelRowsEff - 1 - forward_pad_top;
-  const TensorIndex padding_left = kernelColsEff - 1 - forward_pad_left;
-  const TensorIndex padding_bottom = inputRows + kernelRowsEff - 1 - (outputRows - 1) * stride - 1 - padding_top;
-  const TensorIndex padding_right = inputCols + kernelColsEff - 1 - (outputCols - 1) * stride - 1 - padding_left;
-
-  eigen_assert(padding_top >= 0);
-  eigen_assert(padding_left >= 0);
-  eigen_assert(padding_bottom >= 0);
-  eigen_assert(padding_right >= 0);
-
-  // The output_backward has dimensions out_depth X out_rows X out_cols X OTHERS
-  // When we extract the image patches from output_backward (with input as the
-  // kernel), it will have dimensions
-  //  (out_depth) X (input_rows * input_cols) X (kernel_rows * kernel_cols) X OTHERS
-  DSizes<TensorIndex, 4> pre_contract_dims;
-  if (isColMajor) {
-    pre_contract_dims[0] = kernelFilters;
-    pre_contract_dims[1] = inputRows * inputCols;
-    pre_contract_dims[2] = kernelRows * kernelCols;
-    pre_contract_dims[3] = 1;
-    for (int i = 3; i < NumDims; ++i) {
-      pre_contract_dims[3] *= out.dimension(i);
-    }
-  } else {
-    pre_contract_dims[3] = kernelFilters;
-    pre_contract_dims[2] = inputRows * inputCols;
-    pre_contract_dims[1] = kernelRows * kernelCols;
-    pre_contract_dims[0] = 1;
-    for (int i = 0; i < NumDims - 3; ++i) {
-      pre_contract_dims[0] *= out.dimension(i);
-    }
-  }
-
-  // The input has dimensions in_depth X (input_rows * input_cols) X OTHERS
-  DSizes<TensorIndex, 3> input_dims;
-  if (isColMajor) {
-    input_dims[0] = kernelChannels;
-    input_dims[1] = inputRows * inputCols;
-    input_dims[2] = 1;
-    for (int i = 3; i < NumDims; ++i) {
-      input_dims[2] *= in.dimension(i);
-    }
-    eigen_assert(input_dims[2] == pre_contract_dims[3]);
-  } else {
-    input_dims[2] = kernelChannels;
-    input_dims[1] = inputRows * inputCols;
-    input_dims[0] = 1;
-    for (int i = 0; i < NumDims - 3; ++i) {
-      input_dims[0] *= in.dimension(i);
-    }
-    eigen_assert(input_dims[0] == pre_contract_dims[0]);
-  }
-
-  // We will contract along dimensions (1, 2) in and (1, 3) in out, if
-  // this is col-major.
-  // For row-major, it's dimensions (0, 1) in and (0, 2) in out.
-  array<IndexPair<TensorIndex>, 2> contract_dims;
-  if (isColMajor) {
-    // col-major: in.contract(output.patches)
-    contract_dims[0] = IndexPair<TensorIndex>(1, 1);
-    contract_dims[1] = IndexPair<TensorIndex>(2, 3);
-  } else {
-    // row-major: output.patches.contract(in)
-    contract_dims[0] = IndexPair<TensorIndex>(0, 0);
-    contract_dims[1] = IndexPair<TensorIndex>(2, 1);
-  }
-
-  // After the contraction, the kernel will have dimension
-  // in_depth X out_depth X kernel_rows X kernel_cols
-  // We will need to shuffle the first two dimensions and reverse the latter
-  // two dimensions.
-  // The end shape is
-  // out_depth X in_shape X kernel_rows X kernel_cols
-
-  // This is the shape of the kernel *before* the shuffling.
-  DSizes<TensorIndex, 4> kernel_dims;
-  if (isColMajor) {
-    kernel_dims[0] = kernelChannels;
-    kernel_dims[1] = kernelFilters;
-    kernel_dims[2] = kernelRows;
-    kernel_dims[3] = kernelCols;
-  } else {
-    kernel_dims[0] = kernelCols;
-    kernel_dims[1] = kernelRows;
-    kernel_dims[2] = kernelFilters;
-    kernel_dims[3] = kernelChannels;
-  }
-
-  array<TensorIndex, 4> kernel_shuffle;
-  if (isColMajor) {
-    kernel_shuffle[0] = 1;
-    kernel_shuffle[1] = 0;
-    kernel_shuffle[2] = 2;
-    kernel_shuffle[3] = 3;
-  } else {
-    kernel_shuffle[0] = 0;
-    kernel_shuffle[1] = 1;
-    kernel_shuffle[2] = 3;
-    kernel_shuffle[3] = 2;
-  }
-
-  array<bool, 4> kernel_reverse;
-  if (isColMajor) {
-    kernel_reverse[0] = false;
-    kernel_reverse[1] = false;
-    kernel_reverse[2] = true;
-    kernel_reverse[3] = true;
-  } else {
-    kernel_reverse[0] = true;
-    kernel_reverse[1] = true;
-    kernel_reverse[2] = false;
-    kernel_reverse[3] = false;
-  }
-
-  return choose(Cond<internal::traits<Input>::Layout == ColMajor>(),
-                input.reshape(input_dims).contract(output_backward.extract_image_patches(inputRows, inputCols, in_stride, in_stride, 1, 1, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).reshape(pre_contract_dims), contract_dims).reshape(kernel_dims).reverse(kernel_reverse).shuffle(kernel_shuffle),
-                output_backward.extract_image_patches(inputRows, inputCols, in_stride, in_stride, 1, 1, stride, stride, padding_top, padding_bottom, padding_left, padding_right, 0).reshape(pre_contract_dims).reshape(pre_contract_dims).contract(input.reshape(input_dims), contract_dims).reshape(kernel_dims).reverse(kernel_reverse).shuffle(kernel_shuffle));
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_NEURAL_NETWORKS_BACKWARD_SPATIAL_CONVOLUTIONS_H
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h
deleted file mode 100644
index dfb9dcedba901570e56e9c736fc4d84bbef37e2e..0000000000000000000000000000000000000000
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/CuboidConvolution.h
+++ /dev/null
@@ -1,179 +0,0 @@
-#ifndef EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H
-#define EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H
-
-#include "Patch3d.h"
-
-namespace Eigen {
-
-/** CuboidConvolution
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Applies a 3D convolution over a multichannel input voxel block.
-  *
-  * The input parameter is expected to be a tensor with a rank of 4 or more (channels, depth, height, width, and optionally others).
-  * The kernel parameter is expected to be a 5D tensor (filters, channels, kernel_depth, kernel_height, kernel_width).
-  * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be filters, depth, height, width (and others if applicable).
-  *
-  * The input and kernel have to be in the same layout, and both row-major and
-  * col-major are supported. The shapes given above are for col-major layout.
-  * For row-major, all dimensions should be reversed.
-  *
-  * It is possible to swap the order of the depth, width, and height dimensions provided that the same order is used in the input, the kernel, and the output.
-  */
-template <typename Input, typename Kernel>
-EIGEN_ALWAYS_INLINE
-static const typename internal::conditional <
-    internal::traits<Input>::Layout == ColMajor,
-    TensorReshapingOp<
-        const DSizes<typename internal::traits<Input>::Index,
-                     internal::traits<Input>::NumDimensions>,
-        const TensorContractionOp<
-            const array<IndexPair<typename internal::traits<Input>::Index>, 1>,
-            const TensorReshapingOp<
-                const DSizes<typename internal::traits<Input>::Index, 2>,
-                const Kernel>,
-            const TensorReshapingOp<
-                const DSizes<typename internal::traits<Input>::Index, 2>,
-                const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic,
-                                          const Input> > > >,
-    TensorReshapingOp<
-        const DSizes<typename internal::traits<Input>::Index,
-                     internal::traits<Input>::NumDimensions>,
-        const TensorContractionOp<
-            const array<IndexPair<typename internal::traits<Input>::Index>, 1>,
-            const TensorReshapingOp<
-                const DSizes<typename internal::traits<Input>::Index, 2>,
-                const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic,
-                                          const Input> > ,
-                const TensorReshapingOp<
-                    const DSizes<typename internal::traits<Input>::Index, 2>,
-                    const Kernel> > > >::type
-CuboidConvolution(const Input& input, const Kernel& kernel,
-                  const DenseIndex stridePlanes = 1,
-                  const DenseIndex strideRows = 1,
-                  const DenseIndex strideCols = 1,
-                  const PaddingType padding_type = PADDING_SAME) {
-  typedef typename internal::traits<Input>::Index TensorIndex;
-  TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
-  TensorRef<Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, internal::traits<Kernel>::Layout, TensorIndex> > kern(kernel);
-
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<Kernel>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
-  static const int NumDims = internal::traits<Input>::NumDimensions;
-
-  // Number of filters to apply. This is the same as the output depth of the result.
-  const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[4];
-  const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[3];
-
-  // Spatial size of the kernel.
-  const TensorIndex kernelDepth = isColMajor ? kern.dimensions()[2] : kern.dimensions()[2];
-  const TensorIndex kernelRows = isColMajor ? kern.dimensions()[3] : kern.dimensions()[1];
-  const TensorIndex kernelCols = isColMajor ? kern.dimensions()[4] : kern.dimensions()[0];
-
-  if (isColMajor) {
-    eigen_assert(kernelChannels == in.dimension(0));
-  } else {
-    eigen_assert(kernelChannels == in.dimension(NumDims - 1));
-  }
-
-  const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
-  const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
-  const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4);
-
-  const float stride_planes_f = static_cast<float>(stridePlanes);
-  const float stride_rows_f = static_cast<float>(strideRows);
-  const float stride_cols_f = static_cast<float>(strideCols);
-  TensorIndex out_depth;
-  TensorIndex out_height;
-  TensorIndex out_width;
-  switch (padding_type) {
-    case PADDING_VALID:
-      out_depth = ceil((inputPlanes - kernelDepth + 1.f) / stride_planes_f);
-      out_height = ceil((inputRows - kernelRows + 1.f) / stride_rows_f);
-      out_width = ceil((inputCols - kernelCols + 1.f) / stride_cols_f);
-      break;
-    case PADDING_SAME:
-      out_depth = ceil(inputPlanes / stride_planes_f);
-      out_height = ceil(inputRows / stride_rows_f);
-      out_width = ceil(inputCols / stride_cols_f);
-      break;
-    default:
-      eigen_assert(false && "unexpected padding");
-  }
-
-  DSizes<TensorIndex, 2> kernel_dims;
-  if (isColMajor) {
-    kernel_dims[0] = kernelFilters;
-    kernel_dims[1] = kernelChannels * kernelDepth * kernelRows * kernelCols;
-  } else {
-    kernel_dims[0] = kernelChannels * kernelDepth * kernelRows * kernelCols;
-    kernel_dims[1] = kernelFilters;
-  }
-
-  // Molds the output of the patch extraction result into a 2D tensor:
-  // - the first dimension (dims[0]): the patch values to be multiplied with the kernels
-  // - the second dimension (dims[1]): everything else
-  DSizes<TensorIndex, 2> pre_contract_dims;
-  if (isColMajor) {
-    pre_contract_dims[0] = kernelChannels * kernelDepth * kernelRows * kernelCols;
-    pre_contract_dims[1] = out_depth * out_height * out_width;
-    for (int i = 4; i < NumDims; ++i) {
-      pre_contract_dims[1] *= in.dimension(i);
-    }
-  } else {
-    pre_contract_dims[1] = kernelChannels * kernelDepth * kernelRows * kernelCols;
-    pre_contract_dims[0] = out_depth * out_height * out_width;
-    for (int i = 0; i < NumDims - 4; ++i) {
-      pre_contract_dims[0] *= in.dimension(i);
-    }
-  }
-
-  array<IndexPair<TensorIndex>, 1> contract_dims;
-  contract_dims[0] = IndexPair<TensorIndex>(1, 0);
-
-  // Molds the output of the contraction into the shape expected by the user
-  // (assuming ColMajor):
-  // - 1st dim: kernel filters
-  // - 2nd dim: output depth
-  // - 3nd dim: output height
-  // - 4rd dim: output width
-  // - 5th dim and beyond: everything else including batch size
-  DSizes<TensorIndex, NumDims> post_contract_dims;
-  if (isColMajor) {
-    post_contract_dims[0] = kernelFilters;
-    post_contract_dims[1] = out_depth;
-    post_contract_dims[2] = out_height;
-    post_contract_dims[3] = out_width;
-    for (int i = 4; i < NumDims; ++i) {
-      post_contract_dims[i] = in.dimension(i);
-    }
-  } else {
-    post_contract_dims[NumDims - 1] = kernelFilters;
-    post_contract_dims[NumDims - 2] = out_depth;
-    post_contract_dims[NumDims - 3] = out_height;
-    post_contract_dims[NumDims - 4] = out_width;
-    for (int i = 0; i < NumDims - 4; ++i) {
-      post_contract_dims[i] = in.dimension(i);
-    }
-  }
-
-  return choose(
-      Cond<internal::traits<Input>::Layout == ColMajor>(),
-      kernel.reshape(kernel_dims)
-          .contract(input.extract_volume_patches(
-                             kernelDepth, kernelRows, kernelCols, stridePlanes,
-                             strideRows, strideCols, padding_type)
-                        .reshape(pre_contract_dims),
-                    contract_dims)
-          .reshape(post_contract_dims),
-      input.extract_volume_patches(kernelDepth, kernelRows, kernelCols,
-                                   stridePlanes, strideRows, strideCols,
-                                   padding_type)
-          .reshape(pre_contract_dims)
-          .contract(kernel.reshape(kernel_dims), contract_dims)
-          .reshape(post_contract_dims));
-}
-
-} // end namespace Eigen
-
-#endif  // EIGEN_CXX11_SRC_NEURAL_NETWORKS_CUBOID_CONVOLUTION_H
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h
deleted file mode 100644
index 2864f8329990325c73aadb32018ae975809cb09d..0000000000000000000000000000000000000000
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Patch3d.h
+++ /dev/null
@@ -1,240 +0,0 @@
-#ifndef EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H
-#define EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H
-
-#if not defined(__CUDACC__)
-#include <type_traits>
-#endif
-
-namespace Eigen {
-namespace internal {
-
-/** Extract3DPatches
- * \ingroup CXX11_NeuralNetworksModule
- *
- * \brief Extracts 3D patches from a multichannel input volume.
- *
- * The input parameter is expected to be a tensor with a rank of 4 or more
- * (channels, depth, height, width, optional others in col-major, and the
- * reverse order in row-major).
-
- * The return value will be a tensor of 3 more dimension than the input tensor.
- * In col-major, the first 4 dimensions of the result are: channels, patch_depth,
- * patch_height, patch_width. The next dimensions will identify the patch
- * position on the 3D grid of extracted patches: z, y, x. The remaining
- * dimensions, if any, will be the same as the 'other' dimensions of the input
- * tensor.
- */
-
-template <typename Input>
-EIGEN_ALWAYS_INLINE static const TensorStridingOp<
-    const array<typename internal::traits<Input>::Index,
-                internal::traits<Input>::NumDimensions + 3>,
-    const TensorReshapingOp<
-        const DSizes<typename internal::traits<Input>::Index,
-                     internal::traits<Input>::NumDimensions + 3>,
-        const TensorPatchOp<
-            const DSizes<typename internal::traits<Input>::Index,
-                         internal::traits<Input>::NumDimensions>,
-            const TensorPaddingOp<
-                const array<IndexPair<typename internal::traits<Input>::Index>,
-                            internal::traits<Input>::NumDimensions>,
-                const Input> > > >
-Extract3DPatches(
-    const Input& input, const DenseIndex patchPlanes,
-    const DenseIndex patchRows, const DenseIndex patchCols,
-    const DenseIndex stridePlanes, const DenseIndex strideRows,
-    const DenseIndex strideCols,
-    const DenseIndex paddingZTop, const DenseIndex paddingZBottom,
-    const DenseIndex paddingTop, const DenseIndex paddingBottom,
-    const DenseIndex paddingLeft, const DenseIndex paddingRight,
-    const typename internal::traits<Input>::Scalar padding_value = 0) {
-
-  typedef typename internal::traits<Input>::Index TensorIndex;
-  TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
-
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
-  static const int NumDims = internal::traits<Input>::NumDimensions;
-  static const int ExtDims = NumDims + 3;
-
-  // Tensor size after patch extraction. We add three dimensions to unpack the
-  // linear patch index into a 3D grid over which stride() can work.
-  DSizes<TensorIndex, ExtDims> pre_stride_dims;
-
-  if (isColMajor) {
-    pre_stride_dims[0] = in.dimension(0);
-    pre_stride_dims[1] = patchPlanes;
-    pre_stride_dims[2] = patchRows;
-    pre_stride_dims[3] = patchCols;
-  } else {
-    pre_stride_dims[ExtDims - 1] = in.dimension(NumDims - 1);
-    pre_stride_dims[ExtDims - 4] = patchCols;
-    pre_stride_dims[ExtDims - 3] = patchRows;
-    pre_stride_dims[ExtDims - 2] = patchPlanes;
-  }
-
-  const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
-  const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
-  const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4);
-
-  array<IndexPair<TensorIndex>, NumDims> paddings;
-  for (int i = 0; i < NumDims; ++i) {
-    paddings[i] = IndexPair<TensorIndex>(0, 0);
-  }
-
-  paddings[isColMajor ? 1 : (NumDims - 2)] = IndexPair<TensorIndex>(paddingZTop, paddingZBottom);
-  paddings[isColMajor ? 2 : (NumDims - 3)] = IndexPair<TensorIndex>(paddingTop, paddingBottom);
-  paddings[isColMajor ? 3 : (NumDims - 4)] = IndexPair<TensorIndex>(paddingLeft, paddingRight);
-
-  pre_stride_dims[isColMajor ? 4 : (ExtDims - 5)] = inputPlanes + paddingZBottom + paddingZTop - patchPlanes + 1;
-  pre_stride_dims[isColMajor ? 5 : (ExtDims - 6)] = inputRows + paddingTop + paddingBottom - patchRows + 1;
-  pre_stride_dims[isColMajor ? 6 : (ExtDims - 7)] = inputCols + paddingLeft + paddingRight - patchCols + 1;
-
-  if (isColMajor) {
-    for (int i = 7; i < NumDims + 3; ++i) {
-      pre_stride_dims[i] = in.dimension(i - 3);
-    }
-  } else {
-    for (int i = 0; i < NumDims - 4; ++i) {
-      pre_stride_dims[i] = in.dimension(i);
-    }
-  }
-
-  DSizes<TensorIndex, NumDims> patch_dims;
-  if (isColMajor) {
-    patch_dims[0] = in.dimension(0);
-    patch_dims[1] = patchPlanes;
-    patch_dims[2] = patchRows;
-    patch_dims[3] = patchCols;
-    for (int i = 4; i < NumDims; ++i) {
-      patch_dims[i] = 1;
-    }
-  } else {
-    patch_dims[NumDims - 1] = in.dimension(NumDims - 1);
-    patch_dims[NumDims - 4] = patchCols;
-    patch_dims[NumDims - 3] = patchRows;
-    patch_dims[NumDims - 2] = patchPlanes;
-    for (int i = 0; i < NumDims - 4; i++) {
-      patch_dims[i] = 1;
-    }
-  }
-
-  array<TensorIndex, NumDims + 3> strides;
-  if (isColMajor) {
-    // No striding within the patches.
-    for (int i = 0; i < 4; ++i) {
-      strides[i] = 1;
-    }
-    // Apply striding in the spatial patch grid dimensions only.
-    strides[4] = stridePlanes;
-    strides[5] = strideRows;
-    strides[6] = strideCols;
-    // No striding in the remaining dimensions (batches, ...).
-    for (int i = 7; i < NumDims + 3; i++) {
-      strides[i] = 1;
-    }
-  } else {
-    // No striding within the patches.
-    for (int i = 1; i <= 4; ++i) {
-      strides[ExtDims - i] = 1;
-    }
-    // Apply striding in the spatial patch grid dimensions only.
-    strides[ExtDims - 7] = strideCols;
-    strides[ExtDims - 6] = strideRows;
-    strides[ExtDims - 5] = stridePlanes;
-    // No striding in the remaining dimensions (batches, ...).
-    for (int i = 0; i < NumDims - 4; i++) {
-      strides[i] = 1;
-    }
-  }
-
-  // TODO(mjanusz): Consider getting rid of pad(), and stride() and extend
-  // extract_patches to take additional parameters for padding/striding,
-  // similarly to extract_image_patches.
-  return input.pad(paddings, padding_value).extract_patches(patch_dims).reshape(pre_stride_dims).stride(strides);
-}
-
-
-template <typename Input>
-EIGEN_ALWAYS_INLINE static const TensorStridingOp<
-    const array<typename internal::traits<Input>::Index,
-                internal::traits<Input>::NumDimensions + 3>,
-    const TensorReshapingOp<
-        const DSizes<typename internal::traits<Input>::Index,
-                     internal::traits<Input>::NumDimensions + 3>,
-        const TensorPatchOp<
-            const DSizes<typename internal::traits<Input>::Index,
-                         internal::traits<Input>::NumDimensions>,
-            const TensorPaddingOp<
-                const array<IndexPair<typename internal::traits<Input>::Index>,
-                            internal::traits<Input>::NumDimensions>,
-                const Input> > > >
-Extract3DPatches(
-    const Input& input, const DenseIndex patchPlanes,
-    const DenseIndex patchRows, const DenseIndex patchCols,
-    const DenseIndex stridePlanes, const DenseIndex strideRows,
-    const DenseIndex strideCols, const PaddingType padding_type,
-    const typename internal::traits<Input>::Scalar padding_value = 0) {
-  typedef typename internal::traits<Input>::Index TensorIndex;
-  TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
-
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
-  static const int NumDims = internal::traits<Input>::NumDimensions;
-
-  const TensorIndex inputPlanes = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
-  const TensorIndex inputRows = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
-  const TensorIndex inputCols = isColMajor ? in.dimension(3) : in.dimension(NumDims - 4);
-
-  switch (padding_type) {
-    case PADDING_VALID:
-      // No padding in any dimension.
-      return Extract3DPatches(input, patchPlanes, patchRows, patchCols,
-                              stridePlanes, strideRows, strideCols,
-                              0, 0, 0, 0, 0, 0, padding_value);
-    case PADDING_SAME: {
-      // The side of the tensor before striding should be just the expected
-      // output times the stride.
-      const TensorIndex size_z = ceil(inputPlanes / static_cast<float>(stridePlanes)) * stridePlanes;
-      const TensorIndex size_y = ceil(inputRows / static_cast<float>(strideRows)) * strideRows;
-      const TensorIndex size_x = ceil(inputCols / static_cast<float>(strideCols)) * strideCols;
-
-      // The size of the patch space is going to be: padded_input_size - patch_size + 1.
-      // This has to match the expected size before striding (pre_stride_dims).
-      // The deltas below extend the input to the expected size.
-      const TensorIndex dz = size_z + patchPlanes - 1 - inputPlanes;
-      const TensorIndex dy = size_y + patchRows - 1 - inputRows;
-      const TensorIndex dx = size_x + patchCols - 1 - inputCols;
-
-      return Extract3DPatches(input, patchPlanes, patchRows, patchCols,
-                              stridePlanes, strideRows, strideCols,
-                              dz - dz / 2, dz / 2,
-                              dy - dy / 2, dy / 2,
-                              dx - dx / 2, dx / 2,
-                              padding_value);
-    }
-    default:
-      eigen_assert(false && "unexpected padding");
-      // unreachable code to avoid missing return warning.
-      return Extract3DPatches(input, patchPlanes, patchRows, patchCols,
-                              stridePlanes, strideRows, strideCols,
-                              0, 0, 0, 0, 0, 0, padding_value);
-  }
-}
-
-// TODO(mjanusz): Switch this to a 'using' alias once CUDA supports C++11.
-template <typename Input>
-struct Extract3DPatchesType {
-  typedef const TensorStridingOp< const array<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions + 3>,
-      const TensorReshapingOp< const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions + 3>,
-      const TensorPatchOp< const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>,
-      const TensorPaddingOp< const array< IndexPair<typename internal::traits<Input>::Index>, internal::traits<Input>::NumDimensions>,
-      const Input> > > > type;
-};
-
-}  // end namespace internal
-}  // end namespace Eigen
-
-#endif  // EIGEN_CXX11_SRC_NEURAL_NETWORKS_PATCH3D_H
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h
deleted file mode 100644
index 942b060ba761a2b31e6affc2d3714564ef243134..0000000000000000000000000000000000000000
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/Pooling.h
+++ /dev/null
@@ -1,433 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H
-#define EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H
-
-#include "Patch3d.h"
-
-namespace Eigen {
-
-/** SpatialMaxPooling
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Applies a max-pooling over a multichannel input image.
-  *
-  * The input parameter is expected to be a with a rank of 4 (channels, height, width, others in col-major, and the reverse of that in row-major).
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, height, width, and others (in col-major, and the reverse of that if the input was row-major).
-  *
-  * The order of the width and height dimensions can be swapped if needed.
-  *
-*/
-#if !defined(EIGEN_HAS_INDEX_LIST)
-template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorReshapingOp<const Eigen::DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorReductionOp<internal::MaxReducer<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>, const Eigen::array<int, 2>, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > >
-#else
-template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorReshapingOp<const Eigen::DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorReductionOp<internal::MaxReducer<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>, typename internal::conditional<internal::traits<Input>::Layout == ColMajor, const Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >, const Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3> > >::type, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > >
-#endif
-SpatialMaxPooling(const Input& input, DenseIndex patchRows, DenseIndex patchCols,
-                  DenseIndex strideRows, DenseIndex strideCols, const PaddingType padding_type,
-                  DenseIndex in_strideRows = 1, DenseIndex in_strideCols = 1)
-{
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  typedef typename internal::traits<Input>::Index TensorIndex;
-  TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
-
-  const DenseIndex patchRowsEff = patchRows + (patchRows - 1) * (in_strideRows - 1);
-  const DenseIndex patchColsEff = patchCols + (patchCols - 1) * (in_strideCols - 1);
-
-  static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
-  static const int idxRows = isColMajor ? 1 : 2;
-  static const int idxCols = isColMajor ? 2 : 1;
-
-  // Molds the output of the reduction into the shape expected by the user.
-  // (assuming col-major):
-  // - 1st dim: channels
-  // - 2nd dim: output height
-  // - 3rd dim: output width
-  // - 4th dim and beyond: everything else including batch size
-  Eigen::DSizes<TensorIndex, internal::traits<Input>::NumDimensions> post_reduce_dims;
-  post_reduce_dims[0] = in.dimension(0);
-  if (padding_type == PADDING_VALID) {
-    post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRowsEff + 1.f) / static_cast<float>(strideRows));
-    post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchColsEff + 1.f) / static_cast<float>(strideCols));
-  } else {
-    post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast<float>(strideRows));
-    post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast<float>(strideCols));
-  }
-  post_reduce_dims[3] = in.dimension(3);
-
-#if !defined(EIGEN_HAS_INDEX_LIST)
-  // nvcc doesn't support cxx11
-  Eigen::array<int, 2> reduction_dims;
-  if (isColMajor) {
-    reduction_dims[0] = 1;
-    reduction_dims[1] = 2;
-  } else {
-    reduction_dims[0] = 2;
-    reduction_dims[1] = 3;
-  }
-#else
-  // Take advantage of cxx11 to give the compiler information it can use to
-  // optimize the code.
-  typename internal::conditional<internal::traits<Input>::Layout == ColMajor, const Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >, const Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3> > >::type reduction_dims;
-#endif
-
-  return input.extract_image_patches(patchRows, patchCols, strideRows, strideCols, in_strideRows, in_strideCols, padding_type, -Eigen::NumTraits<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>::highest()).maximum(reduction_dims).reshape(post_reduce_dims);
-}
-
-/** CuboidMaxPooling
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Applies a max-pooling over a multichannel input volume.
-  *
-  * The input parameter is expected to be a tensor with a rank of 5 (channels, depth, height, width, others in col-major, and the reverse of that in row-major).
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, depth, height, width, and others (in col-major, and the reverse of that if the input was row-major).
-  *
-  * The order of the depth, width and height dimensions can be swapped if needed.
-  *
-*/
-#if !defined(EIGEN_HAS_INDEX_LIST)
-template <typename Input>
-EIGEN_ALWAYS_INLINE static const TensorReshapingOp<
-    const Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions>,
-    const TensorReductionOp<
-        internal::MaxReducer<float>, const Eigen::array<int, 1>,
-        const TensorReshapingOp<
-            const Eigen::DSizes<DenseIndex, 3>,
-            const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Input> > > >
-#else
-template <typename Input>
-EIGEN_ALWAYS_INLINE static const TensorReshapingOp<
-    const Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions>,
-    const TensorReductionOp<
-        internal::MaxReducer<float>,
-        const Eigen::IndexList<Eigen::type2index<1> >,
-        const TensorReshapingOp<
-            const Eigen::DSizes<DenseIndex, 3>,
-            const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Input> > > >
-#endif
-CuboidMaxPooling(const Input& input, DenseIndex patchPlanes,
-                 DenseIndex patchRows, DenseIndex patchCols,
-                 DenseIndex stridePlanes, DenseIndex strideRows,
-                 DenseIndex strideCols, const PaddingType padding_type) {
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 5, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
-
-  typedef typename internal::traits<Input>::Index TensorIndex;
-  TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
-
-  static const int idxPlanes = isColMajor ? 1 : 3;
-  static const int idxRows = 2;
-  static const int idxCols = isColMajor ? 3 : 1;
-
-  // Molds the output of the reduction into the shape expected by the used
-  // (assuming col-major):
-  // - 1st dim: channels
-  // - 2nd dim: output depth
-  // - 3rd dim: output height
-  // - 4th dim: output width
-  // - 5th dim and beyond: everything else including batch size
-  Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions> post_reduce_dims;
-  post_reduce_dims[0] = in.dimension(0);
-  if (padding_type == PADDING_VALID) {
-    post_reduce_dims[idxPlanes] = numext::ceil((in.dimension(idxPlanes) - patchPlanes + 1.f) / static_cast<float>(stridePlanes));
-    post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRows + 1.f) / static_cast<float>(strideRows));
-    post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchCols + 1.f) / static_cast<float>(strideCols));
-  } else {
-    post_reduce_dims[idxPlanes] = numext::ceil(in.dimension(idxPlanes) / static_cast<float>(stridePlanes));
-    post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast<float>(strideRows));
-    post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast<float>(strideCols));
-  }
-  post_reduce_dims[4] = in.dimension(4);
-
-  Eigen::DSizes<DenseIndex, 3> pre_reduce_dims;
-  pre_reduce_dims[1] = patchRows * patchCols * patchPlanes;
-  if (isColMajor) {
-    pre_reduce_dims[0] = post_reduce_dims[0];
-    pre_reduce_dims[2] = post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3] * post_reduce_dims[4];
-  } else {
-    pre_reduce_dims[0] = post_reduce_dims[0] * post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3];
-    pre_reduce_dims[2] = post_reduce_dims[4];
-  }
-
-#if !defined(EIGEN_HAS_INDEX_LIST)
-  // nvcc doesn't support cxx11
-  Eigen::array<int, 1> reduction_dims;
-  reduction_dims[0] = 1;
-#else
-  // Take advantage of cxx11 to give the compiler information it can use to
-  // optimize the code.
-  Eigen::IndexList<Eigen::type2index<1> > reduction_dims;
-#endif
-  return input.extract_volume_patches(patchPlanes, patchRows, patchCols,
-                                      stridePlanes, strideRows, strideCols,
-                                      padding_type, -Eigen::NumTraits<float>::highest())
-      .reshape(pre_reduce_dims)
-      .maximum(reduction_dims)
-      .reshape(post_reduce_dims);
-}
-
-
-/** SpatialAvgPooling
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Applies an average pooling over a multichannel input image.
-  *
-  * The input parameter is expected to be a tensor with a rank of 4 (channels, height, width, others in col-major, and the reverse of that in row-major).
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, height, width, and others (in col-major, and the reverse of that if the input was row-major).
-  *
-  * The order of the width and height dimensions can be swapped if needed.
-  *
-*/
-namespace internal {
-
-template <typename T> struct AvgPoolMeanReducer
-{
-#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__)
-  // We only support packet access for floats.
-  static const bool PacketAccess = internal::is_same<T, float>::value;
-#else
-  static const bool PacketAccess = false;
-#endif
-  static const bool IsStateful = true;
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE AvgPoolMeanReducer() : scalarCount_(0) {
-    typedef typename packet_traits<T>::type Packet;
-    packetCount_ = pset1<Packet>(0.0);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
-    if (t != -Eigen::NumTraits<T>::highest()) {
-      (*accum) = (*accum) + t;
-      scalarCount_++;
-    }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return static_cast<T>(0);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
-    eigen_assert(scalarCount_ > 0);
-    return accum / scalarCount_;
-  }
-
-#if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__)
-#ifdef EIGEN_VECTORIZE_AVX
-#define pequal(a,b) _mm256_cmp_ps(a,b,_CMP_EQ_UQ)
-#define psel(a,b,false_mask) _mm256_blendv_ps(a,b,false_mask)
-#else
-#define pequal(a,b) _mm_cmpeq_ps(a,b)
-#define psel(a,b,false_mask) _mm_or_ps(_mm_andnot_ps(false_mask, a), _mm_and_ps(false_mask, b))
-#endif
-
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) {
-    reducePacketWithType(static_cast<T>(0), p, accum);
-  }
-
-  template <typename Packet>
-  void reducePacketWithType(T, const Packet& p, Packet* accum) {
-    Packet skip_mask = pequal(p, pset1<Packet>(-Eigen::NumTraits<T>::highest()));
-    (*accum) = padd<Packet>(*accum, psel(p, pset1<Packet>(0), skip_mask));
-    packetCount_ = padd<Packet>(packetCount_, psel(pset1<Packet>(1), pset1<Packet>(0), skip_mask));
-  }
-
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
-    return pset1<Packet>(0);
-  }
-
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
-    return pdiv(vaccum, packetCount_);
-  }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
-    return (saccum + predux(vaccum)) / (scalarCount_ + predux(packetCount_));
-  }
-#endif
-
- protected:
-    typedef typename packet_traits<T>::type Packet;
-    int scalarCount_;
-    Packet packetCount_;
-};
-
-}  // namespace internal
-
-#if !defined(EIGEN_HAS_INDEX_LIST)
-template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorReshapingOp<const Eigen::DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorReductionOp<internal::AvgPoolMeanReducer<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>, const Eigen::array<int, 2>, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > >
-#else
-template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorReshapingOp<const Eigen::DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorReductionOp<internal::AvgPoolMeanReducer<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>, typename internal::conditional<internal::traits<Input>::Layout == ColMajor, const Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >, const Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3> > >::type, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > >
-#endif
-SpatialAvgPooling(const Input& input, DenseIndex patchRows, DenseIndex patchCols,
-                  DenseIndex strideRows, DenseIndex strideCols, const PaddingType padding_type,
-                  DenseIndex in_strideRows = 1, DenseIndex in_strideCols = 1)
-{
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  typedef typename internal::traits<Input>::Index TensorIndex;
-  TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
-
-  const DenseIndex patchRowsEff = patchRows + (patchRows - 1) * (in_strideRows - 1);
-  const DenseIndex patchColsEff = patchCols + (patchCols - 1) * (in_strideCols - 1);
-
-  static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
-  static const int idxRows = isColMajor ? 1 : 2;
-  static const int idxCols = isColMajor ? 2 : 1;
-
-  // Molds the output of the reduction into the shape expected by the user.
-  // (assuming col-major):
-  // - 1st dim: channels
-  // - 2nd dim: output height
-  // - 3rd dim: output width
-  // - 4th dim and beyond: everything else including batch size
-  Eigen::DSizes<TensorIndex, internal::traits<Input>::NumDimensions> post_reduce_dims;
-  post_reduce_dims[0] = in.dimension(0);
-  if (padding_type == PADDING_VALID) {
-    post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRowsEff + 1.f) / static_cast<float>(strideRows));
-    post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchColsEff + 1.f) / static_cast<float>(strideCols));
-  } else {
-    post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast<float>(strideRows));
-    post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast<float>(strideCols));
-  }
-  post_reduce_dims[3] = in.dimension(3);
-
-  typedef typename internal::remove_const<typename internal::traits<Input>::Scalar>::type CoeffReturnType;
-  internal::AvgPoolMeanReducer<CoeffReturnType> mean_with_nan;
-
-#if !defined(EIGEN_HAS_INDEX_LIST)
-  // nvcc doesn't support cxx11
-  Eigen::array<int, 2> reduction_dims;
-  if (isColMajor) {
-    reduction_dims[0] = 1;
-    reduction_dims[1] = 2;
-  } else {
-    reduction_dims[0] = 2;
-    reduction_dims[1] = 3;
-  }
-#else
-  // Take advantage of cxx11 to give the compiler information it can use to
-  // optimize the code.
-  typename internal::conditional<internal::traits<Input>::Layout == ColMajor, const Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> >, const Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3> > >::type reduction_dims;
-#endif
-  return input.extract_image_patches(patchRows, patchCols, strideRows, strideCols, in_strideRows, in_strideCols, padding_type, -Eigen::NumTraits<typename internal::remove_const<typename internal::traits<Input>::Scalar>::type>::highest()).reduce(reduction_dims, mean_with_nan).reshape(post_reduce_dims);
-}
-
-
-/** CuboidAvgPooling
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Applies an average pooling over a multichannel input volume.
-  *
-  * The input parameter is expected to be a tensor with a rank of 5 (channels, depth, height, width, others, and the reverse of that in row-major).
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be channels, depth, width, and others (in col-major, and the reverse of that if the input was row-major).
-  *
-  * The order of the depth, width and height dimensions can be swapped if needed.
-  *
-*/
-#if !defined(EIGEN_HAS_INDEX_LIST)
-template <typename Input>
-EIGEN_ALWAYS_INLINE static const TensorReshapingOp<
-    const Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions>,
-    const TensorReductionOp<
-        internal::AvgPoolMeanReducer<float>, const Eigen::array<int, 1>,
-        const TensorReshapingOp<
-            const Eigen::DSizes<DenseIndex, 3>,
-            const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Input> > > >
-#else
-template <typename Input>
-EIGEN_ALWAYS_INLINE static const TensorReshapingOp<
-      const Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions>,
-      const TensorReductionOp<
-          internal::AvgPoolMeanReducer<float>,
-          const Eigen::IndexList<Eigen::type2index<1> >,
-          const TensorReshapingOp<
-              const Eigen::DSizes<DenseIndex, 3>,
-              const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Input> > > >
-#endif
-CuboidAvgPooling(const Input& input, DenseIndex patchPlanes,
-                 DenseIndex patchRows, DenseIndex patchCols,
-                 DenseIndex stridePlanes, DenseIndex strideRows,
-                 DenseIndex strideCols, const PaddingType padding_type) {
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 5, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
-
-  typedef typename internal::traits<Input>::Index TensorIndex;
-  TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
-
-  static const int idxPlanes = isColMajor ? 1 : 3;
-  static const int idxRows = 2;
-  static const int idxCols = isColMajor ? 3 : 1;
-  // Molds the output of the reduction into the shape expected by the used
-  // (assuming col-major):
-  // - 1st dim: channels
-  // - 2nd dim: outupt depth
-  // - 3rd dim: output height
-  // - 4th dim: output width
-  // - 5th dim and beyond: everything else including batch size
-  Eigen::DSizes<DenseIndex, internal::traits<Input>::NumDimensions> post_reduce_dims;
-  post_reduce_dims[0] = in.dimension(0);
-  if (padding_type == PADDING_VALID) {
-    post_reduce_dims[idxPlanes] = numext::ceil((in.dimension(idxPlanes) - patchPlanes + 1.f) / static_cast<float>(stridePlanes));
-    post_reduce_dims[idxRows] = numext::ceil((in.dimension(idxRows) - patchRows + 1.f) / static_cast<float>(strideRows));
-    post_reduce_dims[idxCols] = numext::ceil((in.dimension(idxCols) - patchCols + 1.f) / static_cast<float>(strideCols));
-  } else {
-    post_reduce_dims[idxPlanes] = numext::ceil(in.dimension(idxPlanes) / static_cast<float>(stridePlanes));
-    post_reduce_dims[idxRows] = numext::ceil(in.dimension(idxRows) / static_cast<float>(strideRows));
-    post_reduce_dims[idxCols] = numext::ceil(in.dimension(idxCols) / static_cast<float>(strideCols));
-  }
-  post_reduce_dims[4] = in.dimension(4);
-
-  Eigen::DSizes<DenseIndex, 3> pre_reduce_dims;
-  pre_reduce_dims[1] = patchRows * patchCols * patchPlanes;
-  if (isColMajor) {
-    pre_reduce_dims[0] = post_reduce_dims[0];
-    pre_reduce_dims[2] = post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3] * post_reduce_dims[4];
-  } else {
-    pre_reduce_dims[0] = post_reduce_dims[0] * post_reduce_dims[1] * post_reduce_dims[2] * post_reduce_dims[3];
-    pre_reduce_dims[2] = post_reduce_dims[4];
-  }
-
-  typedef typename internal::remove_const<typename internal::traits<Input>::Scalar>::type CoeffReturnType;
-  internal::AvgPoolMeanReducer<CoeffReturnType> mean_with_nan;
-
-#if !defined(EIGEN_HAS_INDEX_LIST)
-  // nvcc doesn't support cxx11
-  Eigen::array<int, 1> reduction_dims;
-  reduction_dims[0] = 1;
-#else
-  // Take advantage of cxx11 to give the compiler information it can use to
-  // optimize the code.
-  Eigen::IndexList<Eigen::type2index<1> > reduction_dims;
-#endif
-  return input.extract_volume_patches(patchPlanes, patchRows, patchCols,
-                                      stridePlanes, strideRows, strideCols,
-                                      padding_type, -Eigen::NumTraits<float>::highest())
-      .reshape(pre_reduce_dims)
-      .reduce(reduction_dims, mean_with_nan)
-      .reshape(post_reduce_dims);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_NEURAL_NETWORKS_POOLING_H
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h
deleted file mode 100644
index f0e21ab9c2eda60813db95583f14d2cf76a38700..0000000000000000000000000000000000000000
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SoftMax.h
+++ /dev/null
@@ -1,83 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H
-#define EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H
-
-namespace Eigen {
-
-/** SoftMax
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Applies a softmax
-  *
-  * The input parameter is expected to be a col-major tensor with a rank of 2 (depth and other).
-  *
-  * The result can be assigned to a tensor of rank and dimensions equal to that of the input. The result will be laid out in col-major order.
-  *
-*/
-
-namespace {
-class SoftmaxOp {
- public:
-  EIGEN_ALWAYS_INLINE SoftmaxOp(const float beta) : beta_(beta) { }
-
-  template <typename Input> EIGEN_ALWAYS_INLINE
-  typename Input::Dimensions dimensions(const Input& input) const {
-    return input.dimensions();
-  }
-
-  template <typename Input, typename Output, typename Device>
-  void eval(const Input& input, Output& output, const Device& device) const
-  {
-#if !defined(EIGEN_HAS_INDEX_LIST)
-    // nvcc doesn't support cxx11
-    Eigen::array<typename internal::traits<Input>::Index, 1> depth_dim;
-    depth_dim[0] = 0;
-    Eigen::array<typename internal::traits<Input>::Index, 2> bcast;
-    bcast[0] = dimensions(input)[0];
-    bcast[1] = 1;
-    DSizes<typename internal::traits<Input>::Index, 2> dims2d;
-    dims2d[0] = 1;
-    dims2d[1] = dimensions(input)[1];
-#else
-    // Take advantage of cxx11 to give the compiler information it can use to
-    // optimize the code.
-    Eigen::IndexList<Eigen::type2index<0>> depth_dim;
-    Eigen::IndexList<int, Eigen::type2index<1>> bcast;
-    bcast.set(0, dimensions(input)[0]);
-    Eigen::IndexList<Eigen::type2index<1>, typename internal::traits<Input>::Index> dims2d;
-    dims2d.set(1, dimensions(input)[1]);
-#endif
-
-    output.device(device) = ((input - input.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast)) * beta_).exp();
-    output.device(device) = output / (output.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
-  }
-
- private:
-  const float beta_;
-};
-}
-
-
-template <typename Input>
-EIGEN_ALWAYS_INLINE
-static const TensorCustomUnaryOp<const SoftmaxOp, const Input>
-SoftMax(const Input& input, const float beta)
-{
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::NumDimensions == 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-  const SoftmaxOp op(beta);
-  return input.customOp(op);
-}
-
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_NEURAL_NETWORKS_SOFTMAX_H
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h
deleted file mode 100644
index 8e2ddca6b5d0dabe63783049bb60e6699e682cb7..0000000000000000000000000000000000000000
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/SpatialConvolutions.h
+++ /dev/null
@@ -1,775 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-#ifndef EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H
-#define EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H
-
-namespace Eigen {
-
-namespace internal {
-
-// These optimizations require vector instructions
-#ifdef EIGEN_VECTORIZE
-
-// TODO: Consolidate this part of the code with the image patch extraction code
-// since they are both very similar.
-template <typename NewDimension, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device,
-          typename Scalar_, typename Index,
-          typename nocontract_t, typename contract_t,
-          int Side, size_t packet_size,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
-class TensorContractionInputMapper<Scalar_, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
-{
- public:
-  typedef TensorContractionInputMapper<Scalar_, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
-  typedef TensorContractionSubMapper<Scalar_, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
-  typedef SubMapper VectorMapper;
-  typedef SubMapper LinearMapper;
-  typedef Scalar_ Scalar;
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  TensorContractionInputMapper(const TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>& tensor,
-                               const nocontract_t&, const nocontract_t&,
-                               const contract_t&, const contract_t&)
-      : m_impl(tensor.impl().impl())
-  {
-    Index patch_rows;
-    Index patch_depth;
-    if (internal::traits<ArgType>::Layout == ColMajor) {
-      patch_depth = tensor.impl().dimensions()[0];
-      patch_rows = tensor.impl().dimensions()[1];
-      m_patch_cols = tensor.impl().dimensions()[2];
-      m_num_patches = tensor.impl().dimensions()[3];
-    } else {
-      static const int NumDims = tensor.impl().dimensions().size();
-      patch_depth = tensor.impl().dimensions()[NumDims - 1];
-      patch_rows = tensor.impl().dimensions()[NumDims - 2];
-      m_patch_cols = tensor.impl().dimensions()[NumDims - 3];
-      m_num_patches = tensor.impl().dimensions()[NumDims - 4];
-    }
-    m_patch_row_inflate_strides = tensor.impl().rowInflateStride();
-    m_patch_col_inflate_strides = tensor.impl().colInflateStride();
-
-    m_colStride = patch_rows;
-
-    m_outputRows = tensor.impl().outputRows();
-    m_row_strides = tensor.impl().userRowStride();
-    m_col_strides = tensor.impl().userColStride();
-
-    m_in_row_strides = tensor.impl().userInRowStride();
-    m_in_col_strides = tensor.impl().userInColStride();
-
-    if (internal::traits<ArgType>::Layout == ColMajor) {
-      m_inputRows = tensor.impl().impl().dimensions()[1];
-      m_inputCols = tensor.impl().impl().dimensions()[2];
-    } else {
-      static const int NumDims = tensor.impl().impl().dimensions().size();
-      m_inputRows = tensor.impl().impl().dimensions()[NumDims - 2];
-      m_inputCols = tensor.impl().impl().dimensions()[NumDims - 3];
-    }
-
-    m_rowInputStride = patch_depth;
-    m_colInputStride = patch_depth * m_inputRows;
-    m_patchInputStride = patch_depth * m_inputRows * m_inputCols;
-
-    m_rowPaddingTop = tensor.impl().rowPaddingTop();
-    m_colPaddingLeft = tensor.impl().colPaddingLeft();
-
-    m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_patch_row_inflate_strides);
-    m_fastInputColStride = internal::TensorIntDivisor<Index>(m_patch_col_inflate_strides);
-    m_fastNumPatches = internal::TensorIntDivisor<Index>(m_num_patches);
-    m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
-    m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows);
-    m_fastDimZero = internal::TensorIntDivisor<Index>(patch_depth);
-  }
-
-  TensorContractionInputMapper(const TensorContractionInputMapper& base_mapper) :
-      m_impl(base_mapper.m_impl) {
-    m_patch_cols = base_mapper.m_patch_cols;
-    m_num_patches = base_mapper.m_num_patches;
-    m_patch_row_inflate_strides = base_mapper.m_patch_row_inflate_strides;
-    m_patch_col_inflate_strides = base_mapper.m_patch_col_inflate_strides;
-
-    m_colStride = base_mapper.m_colStride;
-
-    m_rowInputStride = base_mapper.m_rowInputStride;
-    m_colInputStride = base_mapper.m_colInputStride;
-    m_patchInputStride = base_mapper.m_patchInputStride;
-
-    m_inputRows = base_mapper.m_inputRows;
-    m_inputCols = base_mapper.m_inputCols;
-
-    m_outputRows = base_mapper.m_outputRows;
-    m_row_strides = base_mapper.m_row_strides;
-    m_col_strides = base_mapper.m_col_strides;
-
-    m_in_row_strides = base_mapper.m_in_row_strides;
-    m_in_col_strides = base_mapper.m_in_col_strides;
-
-    m_rowPaddingTop = base_mapper.m_rowPaddingTop;
-    m_colPaddingLeft = base_mapper.m_colPaddingLeft;
-
-    m_fastInputRowStride = base_mapper.m_fastInputRowStride;
-    m_fastInputColStride = base_mapper.m_fastInputColStride;
-    m_fastNumPatches = base_mapper.m_fastNumPatches;
-    m_fastColStride = base_mapper.m_fastColStride;
-    m_fastOutputRows = base_mapper.m_fastOutputRows;
-    m_fastDimZero = base_mapper.m_fastDimZero;
-  }
-
- // If true, turns off some optimizations for loading packets since the image
-  // patches are "non-standard" such as there are non-trivial strides or
-  // inflations in the input.
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool nonStandardPatches() const {
-    return m_in_row_strides != 1 || m_in_col_strides != 1 || m_patch_row_inflate_strides != 1 || m_patch_col_inflate_strides != 1;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
-    return SubMapper(*this, i, j);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
-    return LinearMapper(*this, i, j);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Scalar operator()(Index row) const {
-    Index rowIndex, colIndex, otherIndex;
-    computeBaseIndices(0, rowIndex, colIndex, otherIndex);
-    return loadCoeff(row, rowIndex, colIndex, otherIndex);
-  }
-
-  // Load the coefficient at the patchIndex location instead of the usual m_rowIndex,
-  // m_colIndex, m_otherIndex. This is currently only used by the gpu code.  EIGEN_DEVICE_FUNC
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Scalar operator()(Index row, Index patchIndex) const {
-    Index rowIndex, colIndex, otherIndex;
-    computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex);
-    return loadCoeff(row, rowIndex, colIndex, otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacket(Index row) const {
-    Index rowIndex, colIndex, otherIndex;
-    computeBaseIndices(0, rowIndex, colIndex, otherIndex);
-    return loadPacket(row, rowIndex, colIndex, otherIndex);
-  }
-
-  // Load the packet at the patchIndex location instead of the usual m_rowIndex,
-  // m_colIndex, m_otherIndex. This is currently only used by the gpu code.
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacket(Index row, Index patchIndex) const {
-    Index rowIndex, colIndex, otherIndex;
-    computeBaseIndices(patchIndex, rowIndex, colIndex, otherIndex);
-    return loadPacket(row, rowIndex, colIndex, otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_rowInputStride; }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchRows() const { return m_colStride; }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchCols() const { return m_patch_cols; }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth, const Index baseIndex) const {
-    const Index inputIndex = depth + baseIndex;
-    return m_impl.template packet<Unaligned>(inputIndex);
-  }
-
- private:
-  friend class TensorContractionSubMapper<Scalar, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>;
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Scalar loadCoeff(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const {
-    // Find the offset of the element wrt the location of the first element.
-    const Index patchOffset = patchId / m_fastDimZero;
-
-    const Index colOffset = patchOffset / m_fastColStride;
-    const Index inputCol = colIndex + colOffset * m_in_col_strides;
-    const Index origInputCol = (m_patch_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
-    const Index rowOffset = patchOffset - colOffset * m_colStride;
-    const Index inputRow = rowIndex + rowOffset * m_in_row_strides;
-    const Index origInputRow = (m_patch_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
-    if (origInputCol < 0 | origInputRow < 0 | origInputCol >= m_inputCols | origInputRow >= m_inputRows |
-        (inputCol != origInputCol * m_patch_col_inflate_strides) | (inputRow != origInputRow * m_patch_row_inflate_strides)) {
-      return Scalar(0);
-    }
-    const Index depth = patchId - patchOffset * patchDepth();
-    const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex;
-    return m_impl.coeff(inputIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE Scalar loadCoeffStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const {
-    eigen_assert(!nonStandardPatches());
-
-    // Find the offset of the element wrt the location of the first element.
-    const Index patchOffset = patchId / m_fastDimZero;
-
-    const Index colOffset = patchOffset / m_fastColStride;
-    const Index inputCol = colIndex + colOffset;
-    const Index rowOffset = patchOffset - colOffset * m_colStride;
-    const Index inputRow = rowIndex + rowOffset;
-    if (inputCol < 0 || inputCol >= m_inputCols || inputRow < 0 || inputRow >= m_inputRows) {
-      return Scalar(0);
-    }
-    const Index depth = patchId - patchOffset * patchDepth();
-    const Index inputIndex = depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex;
-    return m_impl.coeff(inputIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacket(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const {
-    const Index packetSize = internal::unpacket_traits<Packet>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(patchId < patchDepth()*patchRows()*m_patch_cols);
-
-    if (nonStandardPatches()) {
-      return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
-    }
-    return loadPacketStandard(patchId, rowIndex, colIndex, otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const {
-    const Index packetSize = internal::unpacket_traits<Packet>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(patchId < patchDepth()*patchRows()*m_patch_cols);
-
-    eigen_assert(!nonStandardPatches());
-
-    if ((patchDepth() % packetSize) == 0) {
-      return loadPacketFast(patchId, rowIndex, colIndex, otherIndex);
-    }
-    else {
-      const Index patchOffsets[2] = {patchId / m_fastDimZero, (patchId + packetSize - 1) / m_fastDimZero};
-
-      const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride};
-
-      const Index inputCols[2] = {colIndex + colOffsets[0], colIndex + colOffsets[1]};
-      if (inputCols[0] >= m_inputCols | inputCols[1] < 0) {
-        // all zeros
-        return internal::pset1<Packet>(Scalar(0));
-      }
-
-      if (inputCols[0] == inputCols[1]) {
-        const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride};
-        eigen_assert(rowOffsets[0] <= rowOffsets[1]);
-        const Index inputRows[2] = {rowIndex + rowOffsets[0], rowIndex + rowOffsets[1]};
-
-        if (inputRows[0] >= m_inputRows | inputRows[1] < 0) {
-          // all zeros
-          return internal::pset1<Packet>(Scalar(0));
-        }
-
-        if (inputRows[0] >= 0 & inputRows[1] < m_inputRows) {
-          // no padding
-          const Index depth = patchId - patchOffsets[0] * patchDepth();
-          const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex;
-          return m_impl.template packet<Unaligned>(inputIndex);
-        }
-      }
-    }
-    return packetWithPossibleZero(patchId, rowIndex, colIndex, otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const {
-    const Index packetSize = internal::unpacket_traits<Packet>::size;
-    EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(patchId < patchDepth()*patchRows()*m_patch_cols);
-
-    eigen_assert(!nonStandardPatches());
-    eigen_assert((patchDepth() % packetSize) == 0);
-    // Find the offset of the element wrt the location of the first element.
-    const Index patchOffset = patchId / m_fastDimZero;
-    eigen_assert((patchId + packetSize - 1)  / m_fastDimZero == patchOffset);
-
-    const Index colOffset = patchOffset / m_fastColStride;
-    const Index inputCol = colIndex + colOffset;
-    const Index rowOffset = patchOffset - colOffset*m_colStride;
-    const Index inputRow = rowIndex + rowOffset;
-    if (inputCol < 0 | inputRow < 0 | inputCol >= m_inputCols | inputRow >= m_inputRows) {
-      // all zeros
-      return internal::pset1<Packet>(Scalar(0));
-    }
-    // no padding
-    const Index depth = patchId - patchOffset * patchDepth();
-    const Index inputIndex = depth + inputRow * m_rowInputStride + inputCol * m_colInputStride + otherIndex;
-    return m_impl.template packet<Unaligned>(inputIndex);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet packetWithPossibleZero(Index patchId, Index rowIndex, Index colIndex, Index otherIndex) const
-  {
-    const int packetSize = internal::unpacket_traits<Packet>::size;
-    EIGEN_ALIGN_MAX typename internal::remove_const<Scalar>::type values[packetSize];
-    for (int i = 0; i < packetSize; ++i) {
-      values[i] = loadCoeff(patchId+i, rowIndex, colIndex, otherIndex);
-    }
-    Packet rslt = internal::pload<Packet>(values);
-    return rslt;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void computeBaseIndices(Index patchIndex, Index& rowIndex, Index& colIndex, Index& otherIndex) const {
-    const int NumInputDims = array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
-    otherIndex = (NumInputDims == 3) ? 0 : patchIndex / m_fastNumPatches;
-    const Index patch2DIndex = (NumInputDims == 3) ? patchIndex : (patchIndex - otherIndex * m_num_patches);
-    otherIndex *= m_patchInputStride;
-    colIndex = patch2DIndex / m_fastOutputRows;
-    rowIndex = patch2DIndex - colIndex * m_outputRows;
-    colIndex = colIndex * m_col_strides - m_colPaddingLeft;
-    rowIndex = rowIndex * m_row_strides - m_rowPaddingTop;
-  }
-
-  Index m_patch_cols;    // number of colums in the patch
-  Index m_num_patches;   // number of patches to extract.
-  Index m_patch_row_inflate_strides;  // the strides for row inflation in the image patch
-  Index m_patch_col_inflate_strides;  // the strides for col inflation in the image patch
-  // Fast representation of inflation strides.
-  internal::TensorIntDivisor<Index> m_fastInputRowStride;
-  internal::TensorIntDivisor<Index> m_fastInputColStride;
-
-  Index m_otherStride;
-  Index m_colStride;
-  internal::TensorIntDivisor<Index> m_fastNumPatches;
-  internal::TensorIntDivisor<Index> m_fastColStride;
-
-  Index m_rowInputStride;     // row stride in the input tensor
-  Index m_colInputStride;     // col stride in the input tensor
-  Index m_patchInputStride;   // patch stride in the input tensor
-
-  Index m_inputRows;     // Number of rows in the input tensor
-  Index m_inputCols;     // Number of cols in the input tensor
-
-  Index m_outputRows;    // Number of patch rows
-
-  Index m_row_strides;   // User specified row stride
-  Index m_col_strides;   // User specified col stride
-
-  Index m_in_row_strides;  // User specified input row stride
-  Index m_in_col_strides;  // User specified input col stride
-
-  Index m_rowPaddingTop;    // Row padding
-  Index m_colPaddingLeft;   // Column padding
-
-  internal::TensorIntDivisor<Index> m_fastOutputRows;
-  internal::TensorIntDivisor<Index> m_fastDimZero;
-
-  const TensorEvaluator<ArgType, Device> m_impl;
-};
-
-
-template <typename NewDimension, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device,
-          typename Scalar_, typename Index,
-          typename nocontract_t, typename contract_t,
-          int Side, size_t packet_size,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
-class TensorContractionSubMapper<Scalar_, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>
-{
- public:
-  typedef Scalar_ Scalar;
-  typedef typename packet_traits<Scalar>::type Packet;
-  typedef typename packet_traits<Scalar>::half HalfPacket;
-
-  typedef TensorContractionInputMapper<Scalar, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
-  typedef TensorContractionSubMapper<Scalar, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
-  typedef Self LinearMapper;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
-      : m_base_mapper(base_mapper), m_depth_offset(vert_offset), m_col_offset(horiz_offset) {
-    m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(const Self& base_mapper, Index vert_offset, Index horiz_offset)
-      : m_base_mapper(base_mapper.m_base_mapper), m_depth_offset(vert_offset+base_mapper.m_depth_offset), m_col_offset(horiz_offset+base_mapper.m_col_offset) {
-    m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex, m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
-    return m_base_mapper.loadCoeff(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
-    return m_base_mapper(i + m_depth_offset, j + m_col_offset);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
-   return m_base_mapper.loadPacket(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
-    return m_base_mapper.template loadPacket(i + m_depth_offset, j + m_col_offset);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar loadCoeffStandard(Index i) const {
-    return m_base_mapper.loadCoeffStandard(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketFast(Index i) const {
-   return m_base_mapper.loadPacketFast(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacketStandard(Index i) const {
-   return m_base_mapper.loadPacketStandard(i + m_depth_offset, m_rowIndex, m_colIndex, m_otherIndex);
-  }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC bool aligned(Index) const {
-    return false;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool nonStandardPatches() const {
-    return m_base_mapper.nonStandardPatches();
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchDepth() const { return m_base_mapper.m_rowInputStride; }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchRows() const { return m_base_mapper.m_colStride; }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index patchCols() const { return m_base_mapper.m_patch_cols; }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Packet packetNoPadding(const Index depth, const Index baseIndex) const {
-    const Index inputIndex = depth + baseIndex;
-    return m_base_mapper.m_impl.template packet<Unaligned>(inputIndex);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool padRow(const Index row) const {
-    const Index r = m_rowIndex + row;
-    return r < 0 | r >= m_base_mapper.m_inputRows;
-  }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE bool padCol(const Index col) const {
-    const Index c = m_colIndex + col;
-    return c < 0 | c >= m_base_mapper.m_inputCols;
-    }
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index baseIndex(const Index row, const Index col) const {
-    const Index r = m_rowIndex + row;
-    const Index c = m_colIndex + col;
-    return r * m_base_mapper.m_rowInputStride + c * m_base_mapper.m_colInputStride + m_otherIndex;
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index rowOffset() const {
-    const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
-    const Index colOffset = patchOffset / m_base_mapper.m_fastColStride;
-    return patchOffset-colOffset*m_base_mapper.m_colStride;
-    }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index colOffset() const {
-    const Index patchOffset = m_depth_offset / m_base_mapper.m_fastDimZero;
-    const Index colOffset = patchOffset / m_base_mapper.m_fastColStride;
-    return colOffset;
-    }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_ALWAYS_INLINE Index depthOffset() const {
-    const Index patchOffset = m_depth_offset % m_base_mapper.patchDepth();
-    return patchOffset;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
-    return LinearMapper(m_base_mapper, i + m_depth_offset, j + m_col_offset);
- }
-
- private:
-  const ParentMapper& m_base_mapper;  // that was a reference before
-  Index m_depth_offset;  // First row in the input matrix
-  Index m_col_offset;    // First col in the input matrix
-
-  Index m_rowIndex;        // precomputed row index corresponding to the col offset
-  Index m_colIndex;        // precomputed col index corresponding to the col offset
-  Index m_otherIndex;      // precomputed other index corresponding to the col offset
-
-};
-
-
-template <typename NewDimension, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device,
-          typename Scalar, typename Index,
-          typename nocontract_t, typename contract_t,
-          int Side, size_t packet_size,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, int nr>
-struct gemm_pack_rhs<Scalar, Index, TensorContractionSubMapper<Scalar, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment>, nr, ColMajor, false, false> {
-
-  typedef TensorContractionSubMapper<Scalar, Index, Side, TensorEvaluator<const TensorReshapingOp<NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >, Device>, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
-  typedef SubMapper DataMapper;
-
-  static inline Index ceil_div(Index a, Index b) {
-    return (a + b - 1) / b;
-  }
-
-  EIGEN_DONT_INLINE void operator()(Scalar* block, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0) const {
-    eigen_assert(stride == 0);
-    eigen_assert(offset == 0);
-
-    EIGEN_STATIC_ASSERT((nr == 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    typedef typename DataMapper::LinearMapper LinearMapper;
-    typedef typename packet_traits<Scalar>::type Packet;
-
-    const Index packet_cols4 = (cols/4) * 4;
-    const Index peeled_k = (depth/packet_size) * packet_size;
-    const bool non_standard_patches = rhs.nonStandardPatches();
-
-    for(Index j2=0; j2<packet_cols4; j2+=4)
-    {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
-      const SubMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
-      const SubMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
-      const SubMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
-
-      Index k=0;
-      if((packet_size%4)==0 && !non_standard_patches)
-      {
-        const Index patch_depth = rhs.patchDepth();
-        if ((patch_depth % packet_size) == 0) {
-          const Index patch_cols = rhs.patchCols();
-          const Index patch_rows = rhs.patchRows();
-
-          const Index startCol = rhs.colOffset();
-          const Index max_cols = std::min<Index>(ceil_div(peeled_k, patch_rows*patch_depth)+startCol, patch_cols);
-
-          for (Index c = startCol; c < max_cols; ++c) {
-            eigen_assert(k < peeled_k);
-            const Index startRow = (c == startCol) ? rhs.rowOffset() : 0;
-            const Index max_rows = std::min<Index>(ceil_div(peeled_k-c*patch_rows*patch_depth, patch_depth)+startRow, patch_rows);
-
-            const bool pad_col0 = dm0.padCol(c);
-            const bool pad_col1 = dm1.padCol(c);
-            const bool pad_col2 = dm2.padCol(c);
-            const bool pad_col3 = dm3.padCol(c);
-            for (Index r = startRow; r < max_rows; ++r) {
-              eigen_assert(k < peeled_k);
-              const bool pad0 = pad_col0 || dm0.padRow(r);
-              const bool pad1 = pad_col1 || dm1.padRow(r);
-              const bool pad2 = pad_col2 || dm2.padRow(r);
-              const bool pad3 = pad_col3 || dm3.padRow(r);
-
-              const Index idx0 = dm0.baseIndex(r, c);
-              const Index idx1 = dm1.baseIndex(r, c);
-              const Index idx2 = dm2.baseIndex(r, c);
-              const Index idx3 = dm3.baseIndex(r, c);
-
-              const Index startDepth = ((c == startCol) && (r == startRow)) ? rhs.depthOffset() : 0;
-              const Index max_depth = std::min<Index>(peeled_k-c*patch_rows*patch_depth-r*patch_depth+startDepth, patch_depth);
-              eigen_assert(max_depth % packet_size == 0);
-              for (Index d = startDepth; d < max_depth; d += packet_size) {
-                eigen_assert(k < peeled_k);
-                PacketBlock<Packet, 4> kernel;
-                kernel.packet[0] = pad0 ? pset1<Packet>(0) : rhs.packetNoPadding(d, idx0);
-                kernel.packet[1] = pad1 ? pset1<Packet>(0) : rhs.packetNoPadding(d, idx1);
-                kernel.packet[2] = pad2 ? pset1<Packet>(0) : rhs.packetNoPadding(d, idx2);
-                kernel.packet[3] = pad3 ? pset1<Packet>(0) : rhs.packetNoPadding(d, idx3);
-                ptranspose(kernel);
-                pstoreu(block+0*packet_size, kernel.packet[0]);
-                pstoreu(block+1*packet_size, kernel.packet[1]);
-                pstoreu(block+2*packet_size, kernel.packet[2]);
-                pstoreu(block+3*packet_size, kernel.packet[3]);
-                block+=4*packet_size;
-                k += packet_size;
-              }
-            }
-          }
-
-          for(; k<peeled_k; k+=packet_size) {
-            PacketBlock<Packet, 4> kernel;
-            kernel.packet[0] = dm0.loadPacketFast(k);
-            kernel.packet[1] = dm1.loadPacketFast(k);
-            kernel.packet[2] = dm2.loadPacketFast(k);
-            kernel.packet[3] = dm3.loadPacketFast(k);
-            ptranspose(kernel);
-            pstoreu(block+0*packet_size, kernel.packet[0]);
-            pstoreu(block+1*packet_size, kernel.packet[1]);
-            pstoreu(block+2*packet_size, kernel.packet[2]);
-            pstoreu(block+3*packet_size, kernel.packet[3]);
-            block+=4*packet_size;
-          }
-        }
-        else {
-          for(; k<peeled_k; k+=packet_size) {
-            PacketBlock<Packet, 4> kernel;
-            kernel.packet[0] = dm0.loadPacketStandard(k);
-            kernel.packet[1] = dm1.loadPacketStandard(k);
-            kernel.packet[2] = dm2.loadPacketStandard(k);
-            kernel.packet[3] = dm3.loadPacketStandard(k);
-            ptranspose(kernel);
-            pstoreu(block+0*packet_size, kernel.packet[0]);
-            pstoreu(block+1*packet_size, kernel.packet[1]);
-            pstoreu(block+2*packet_size, kernel.packet[2]);
-            pstoreu(block+3*packet_size, kernel.packet[3]);
-            block+=4*packet_size;
-          }
-        }
-      }
-      if (!rhs.nonStandardPatches()) {
-        for(; k<depth; k++)
-        {
-          block[0] = dm0.loadCoeffStandard(k);
-          block[1] = dm1.loadCoeffStandard(k);
-          block[2] = dm2.loadCoeffStandard(k);
-          block[3] = dm3.loadCoeffStandard(k);
-          block += 4;
-        }
-      }
-      else {
-        for(; k<depth; k++)
-        {
-          block[0] = dm0(k);
-          block[1] = dm1(k);
-          block[2] = dm2(k);
-          block[3] = dm3(k);
-          block += 4;
-        }
-      }
-    }
-
-    // copy the remaining columns one at a time (nr==1)
-    for(Index j2=packet_cols4; j2<cols; ++j2)
-    {
-      const SubMapper dm0 = rhs.getLinearMapper(0, j2);
-      for(Index k=0; k<depth; k++)
-      {
-        *block = dm0(k);
-        block += 1;
-      }
-    }
-  }
-};
-
-#endif  // EIGEN_VECTORIZE
-}  // end namespace internal
-
-
-/** SpatialConvolution
-  * \ingroup CXX11_NeuralNetworks_Module
-  *
-  * \brief Applies a 2D convolution over a multichannel input image.
-  *
-  * The input parameter is expected to be a tensor with a rank of 3 or more (channels, height, width, and optionally others)
-  * The kernel parameter is expected to be a 4D tensor (filters, channels, kernel_height, kernel_width)
-  * The input and the kernel must both be in col-major layout. The result will also be in col-major layout.
-  *
-  * If in_stride > 1, then applies convolution with holes (aka atrous convolution), sampling every in_stride input pixels.
-  *
-  * The result can be assigned to a tensor of rank equal to the rank of the input. The dimensions of the result will be filters, height, width (and others if applicable).
-  *
-  * It is possible to swap the order of the width and height dimensions provided that the same order is used in the input, the kernel, and the output.
-  *
-  */
-template <typename Input, typename Kernel>
-EIGEN_ALWAYS_INLINE
-static const typename internal::conditional<
-  internal::traits<Input>::Layout == ColMajor,
-  TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorContractionOp<const array<IndexPair<typename internal::traits<Input>::Index>, 1>, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, const Kernel>, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, const TensorImagePatchOp<Dynamic, Dynamic, const Input> > > >,
-  TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, internal::traits<Input>::NumDimensions>, const TensorContractionOp<const array<IndexPair<typename internal::traits<Input>::Index>, 1>, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, const TensorImagePatchOp<Dynamic, Dynamic, const Input> >, const TensorReshapingOp<const DSizes<typename internal::traits<Input>::Index, 2>, const Kernel> > > >::type
-SpatialConvolution(const Input& input, const Kernel& kernel, const DenseIndex stride = 1, const PaddingType padding_type = PADDING_SAME, const DenseIndex in_stride = 1) {
-
-  typedef typename internal::traits<Input>::Index TensorIndex;
-  TensorRef<Tensor<typename internal::traits<Input>::Scalar, internal::traits<Input>::NumDimensions, internal::traits<Input>::Layout, TensorIndex> > in(input);
-  TensorRef<Tensor<typename internal::traits<Kernel>::Scalar, internal::traits<Kernel>::NumDimensions, internal::traits<Kernel>::Layout, TensorIndex> > kern(kernel);
-
-  EIGEN_STATIC_ASSERT(internal::traits<Input>::Layout == internal::traits<Kernel>::Layout, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  static const bool isColMajor = (internal::traits<Input>::Layout == ColMajor);
-
-  static const int NumDims = internal::traits<Input>::NumDimensions;
-
-  // Number of filters to apply. This is the same as the output depth of the result
-  const TensorIndex kernelFilters = isColMajor ? kern.dimensions()[0] : kern.dimensions()[3];
-  // Number of channels. This is the same as the input depth.
-  const TensorIndex kernelChannels = isColMajor ? kern.dimensions()[1] : kern.dimensions()[2];
-  const TensorIndex kernelRows = isColMajor ? kern.dimensions()[2] : kern.dimensions()[1];
-  const TensorIndex kernelCols = isColMajor ? kern.dimensions()[3] : kern.dimensions()[0];
-
-  const DenseIndex kernelRowsEff = kernelRows + (kernelRows - 1) * (in_stride - 1);
-  const DenseIndex kernelColsEff = kernelCols + (kernelCols - 1) * (in_stride - 1);
-
-  array<IndexPair<TensorIndex>, 1> contract_dims;
-  contract_dims[0] = IndexPair<TensorIndex>(1, 0);
-
-  const TensorIndex InputRows = isColMajor ? in.dimension(1) : in.dimension(NumDims - 2);
-  const TensorIndex InputCols = isColMajor ? in.dimension(2) : in.dimension(NumDims - 3);
-
-  TensorIndex out_height;
-  TensorIndex out_width;
-  switch (padding_type) {
-    case PADDING_VALID:
-      out_height = numext::ceil((InputRows - kernelRowsEff + 1.f) / static_cast<float>(stride));
-      out_width = numext::ceil((InputCols - kernelColsEff + 1.f) / static_cast<float>(stride));
-      break;
-    case PADDING_SAME:
-      out_height = numext::ceil(InputRows / static_cast<float>(stride));
-      out_width = numext::ceil(InputCols / static_cast<float>(stride));
-      break;
-    default:
-      eigen_assert(false && "unexpected padding");
-  }
-
-  // Molds the output of the patch extraction code into a 2d tensor:
-  // - the first dimension (dims[0]): the patch values to be multiplied with the kernels
-  // - the second dimension (dims[1]): everything else
-  DSizes<TensorIndex, 2> pre_contract_dims;
-  if (isColMajor) {
-    pre_contract_dims[0] = kernelChannels * kernelRows * kernelCols;
-    pre_contract_dims[1] = out_height * out_width;
-    for (int i = 3; i < NumDims; ++i) {
-      pre_contract_dims[1] *= in.dimension(i);
-    }
-  } else {
-    pre_contract_dims[1] = kernelChannels * kernelRows * kernelCols;
-    pre_contract_dims[0] = out_height * out_width;
-    for (int i = 0; i < NumDims - 3; ++i) {
-      pre_contract_dims[0] *= in.dimension(i);
-    }
-  }
-
-  // Molds the output of the contraction into the shape expected by the used
-  // (assuming this is ColMajor):
-  // - 1st dim: kernel filters
-  // - 2nd dim: output height
-  // - 3rd dim: output width
-  // - 4th dim and beyond: everything else including batch size
-  DSizes<TensorIndex, NumDims> post_contract_dims;
-  if (isColMajor) {
-    post_contract_dims[0] = kernelFilters;
-    post_contract_dims[1] = out_height;
-    post_contract_dims[2] = out_width;
-    for (int i = 3; i < NumDims; ++i) {
-      post_contract_dims[i] = in.dimension(i);
-    }
-  } else {
-    post_contract_dims[NumDims - 1] = kernelFilters;
-    post_contract_dims[NumDims - 2] = out_height;
-    post_contract_dims[NumDims - 3] = out_width;
-    for (int i = 0; i < NumDims - 3; ++i) {
-      post_contract_dims[i] = in.dimension(i);
-    }
-  }
-
-  DSizes<TensorIndex, 2> kernel_dims;
-  if (isColMajor) {
-    kernel_dims[0] = kernelFilters;
-    kernel_dims[1] = kernelChannels * kernelRows * kernelCols;
-  } else {
-    kernel_dims[0] = kernelChannels * kernelRows * kernelCols;
-    kernel_dims[1] = kernelFilters;
-  }
-  // TODO(yangke): choose() is defined in TensorContraction.h -- consider
-  // moving it to somewhere more "common".
-  return choose(Cond<internal::traits<Input>::Layout == ColMajor>(),
-                kernel.reshape(kernel_dims).contract(input.extract_image_patches(kernelRows, kernelCols, stride, stride, in_stride, in_stride, padding_type).reshape(pre_contract_dims), contract_dims).reshape(post_contract_dims),
-                input.extract_image_patches(kernelRows, kernelCols, stride, stride, in_stride, in_stride, padding_type).reshape(pre_contract_dims).contract(kernel.reshape(kernel_dims), contract_dims).reshape(post_contract_dims));
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_NEURAL_NETWORKS_SPATIAL_CONVOLUTIONS_H
diff --git a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h b/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h
deleted file mode 100644
index 0e7217353644acd1c085f4f661dbe62fc06e6088..0000000000000000000000000000000000000000
--- a/third_party/eigen3/unsupported/Eigen/CXX11/src/NeuralNetworks/TensorConvolutionByFFT.h
+++ /dev/null
@@ -1,289 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2015 Jianwei Cui <thucjw@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H
-
-namespace Eigen {
-
-/** \class TensorConvolutionByFFT
-  * \ingroup CXX11_Tensor_Module
-  *
-  * \brief Tensor convolution class.
-  *
-  *
-  */
-namespace internal {
-
-
-template<typename Dimensions, typename InputXprType, typename KernelXprType>
-struct traits<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> >
-{
-  // Type promotion to handle the case where the types of the lhs and the rhs are different.
-  typedef typename promote_storage_type<typename InputXprType::Scalar,
-                                        typename KernelXprType::Scalar>::ret Scalar;
-  typedef typename packet_traits<Scalar>::type Packet;
-  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
-                                        typename traits<KernelXprType>::StorageKind>::ret StorageKind;
-  typedef typename promote_index_type<typename traits<InputXprType>::Index,
-                                      typename traits<KernelXprType>::Index>::type Index;
-  typedef typename InputXprType::Nested LhsNested;
-  typedef typename KernelXprType::Nested RhsNested;
-  typedef typename remove_reference<LhsNested>::type _LhsNested;
-  typedef typename remove_reference<RhsNested>::type _RhsNested;
-  static const int NumDimensions = traits<InputXprType>::NumDimensions;
-  static const int Layout = traits<InputXprType>::Layout;
-
-  enum {
-    Flags = 0,
-  };
-};
-
-template<typename Dimensions, typename InputXprType, typename KernelXprType>
-struct eval<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
-{
-  typedef const TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>& type;
-};
-
-template<typename Dimensions, typename InputXprType, typename KernelXprType>
-struct nested<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> >::type>
-{
-  typedef TensorConvolutionByFFTOp<Dimensions, InputXprType, KernelXprType> type;
-};
-
-}  // end namespace internal
-
-
-
-template<typename Indices, typename InputXprType, typename KernelXprType>
-class TensorConvolutionByFFTOp : public TensorBase<TensorConvolutionByFFTOp<Indices, InputXprType, KernelXprType> >
-{
-  public:
-  typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Scalar Scalar;
-  typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Packet Packet;
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-  typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
-                                                  typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
-  typedef typename internal::promote_storage_type<typename InputXprType::PacketReturnType,
-                                                  typename KernelXprType::PacketReturnType>::ret PacketReturnType;
-  typedef typename Eigen::internal::nested<TensorConvolutionByFFTOp>::type Nested;
-  typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::StorageKind StorageKind;
-  typedef typename Eigen::internal::traits<TensorConvolutionByFFTOp>::Index Index;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionByFFTOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
-      : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const Indices& indices() const { return m_indices; }
-
-    /** \returns the nested expressions */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const typename internal::remove_all<typename InputXprType::Nested>::type&
-    inputExpression() const { return m_input_xpr; }
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const typename internal::remove_all<typename KernelXprType::Nested>::type&
-    kernelExpression() const { return m_kernel_xpr; }
-
-  protected:
-    typename InputXprType::Nested m_input_xpr;
-    typename KernelXprType::Nested m_kernel_xpr;
-    const Indices m_indices;
-};
-
-
-template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
-struct TensorEvaluator<const TensorConvolutionByFFTOp<Indices, InputArgType, KernelArgType>, Device>
-{
-  typedef TensorConvolutionByFFTOp<Indices, InputArgType, KernelArgType> XprType;
-
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename XprType::PacketReturnType PacketReturnType;
-
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-
-  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
-  static const int NumKernelDims = internal::array_size<Indices>::value;
-  typedef typename XprType::Index Index;
-  typedef DSizes<Index, NumDims> Dimensions;
-
-  enum {
-    IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned &
-                TensorEvaluator<KernelArgType, Device>::IsAligned,
-    PacketAccess = false,
-    BlockAccess = false,
-    Layout = TensorEvaluator<InputArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-  };
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
-  {
-    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
-    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
-
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      m_inputStride[0] = 1;
-      for (int i = 1; i < NumDims; ++i) {
-        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
-      }
-    } else {
-      m_inputStride[NumDims - 1] = 1;
-      for (int i = NumDims - 2; i >= 0; --i) {
-        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
-      }
-    }
-
-    m_dimensions = m_inputImpl.dimensions();
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      for (int i = 0; i < NumKernelDims; ++i) {
-        const Index index = op.indices()[i];
-        const Index input_dim = input_dims[index];
-        const Index kernel_dim = kernel_dims[i];
-        const Index result_dim = input_dim - kernel_dim + 1;
-        m_dimensions[index] = result_dim;
-        if (i > 0) {
-          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
-        } else {
-          m_kernelStride[0] = 1;
-        }
-        m_indexStride[i] = m_inputStride[index];
-      }
-
-      m_outputStride[0] = 1;
-      for (int i = 1; i < NumDims; ++i) {
-        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
-      }
-    } else {
-      for (int i = NumKernelDims - 1; i >= 0; --i) {
-        const Index index = op.indices()[i];
-        const Index input_dim = input_dims[index];
-        const Index kernel_dim = kernel_dims[i];
-        const Index result_dim = input_dim - kernel_dim + 1;
-        m_dimensions[index] = result_dim;
-        if (i < NumKernelDims - 1) {
-          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
-        } else {
-          m_kernelStride[NumKernelDims - 1] = 1;
-        }
-        m_indexStride[i] = m_inputStride[index];
-      }
-
-      m_outputStride[NumDims - 1] = 1;
-      for (int i = NumDims - 2; i >= 0; --i) {
-        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
-      }
-    }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
-
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
-    m_inputImpl.evalSubExprsIfNeeded(NULL);
-    m_kernelImpl.evalSubExprsIfNeeded(NULL);
-
-    typedef typename internal::traits<InputArgType>::Index TensorIndex;
-
-    Tensor<Scalar, NumDims, Layout, TensorIndex> input(m_inputImpl.dimensions());
-    for (int i = 0; i < m_inputImpl.dimensions().TotalSize(); ++i) {
-      input.data()[i] = m_inputImpl.coeff(i);
-    }
-
-    Tensor<Scalar, NumDims, Layout, TensorIndex> kernel(m_kernelImpl.dimensions());
-    for (int i = 0; i < m_kernelImpl.dimensions().TotalSize(); ++i) {
-      kernel.data()[i] = m_kernelImpl.coeff(i);
-    }
-
-    array<std::pair<ptrdiff_t, ptrdiff_t>, NumDims> paddings;
-    for (int i = 0; i < NumDims; ++i) {
-      paddings[i] = std::make_pair(0, m_inputImpl.dimensions()[i] - m_kernelImpl.dimensions()[i]);
-    }
-
-    Eigen::array<bool, NumKernelDims> reverse;
-    for (int i = 0; i < NumKernelDims; ++i) {
-      reverse[i] = true;
-    }
-
-    Eigen::array<bool, NumDims> fft;
-    for (int i = 0; i < NumDims; ++i) {
-      fft[i] = i;
-    }
-
-    Eigen::DSizes<TensorIndex, NumDims> slice_offsets;
-    for (int i = 0; i < NumDims; ++i) {
-      slice_offsets[i] = m_kernelImpl.dimensions()[i] - 1;
-    }
-
-    Eigen::DSizes<TensorIndex, NumDims> slice_extents;
-    for (int i = 0; i < NumDims; ++i) {
-      slice_extents[i] = m_inputImpl.dimensions()[i] - m_kernelImpl.dimensions()[i] + 1;
-    }
-
-    Tensor<Scalar, NumDims, Layout, TensorIndex> kernel_variant =  kernel.reverse(reverse).pad(paddings);
-    Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> kernel_fft =  kernel_variant.template fft<Eigen::BothParts, FFT_FORWARD>(fft);
-    //Tensor<std::complex<Scalar>, NumDims, Layout|IndexType> kernel_fft =  kernel.reverse(reverse).pad(paddings).template fft<2>(fft);
-    Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> input_fft = input.template fft<Eigen::BothParts, FFT_FORWARD>(fft);
-    Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> prod = (input_fft * kernel_fft).template fft<Eigen::BothParts, FFT_REVERSE>(fft);
-    Tensor<std::complex<Scalar>, NumDims, Layout, TensorIndex> tensor_result = prod.slice(slice_offsets, slice_extents);
-
-    for (int i = 0; i < tensor_result.size(); ++i) {
-      data[i] = std::real(tensor_result.data()[i]);
-    }
-    return false;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    m_inputImpl.cleanup();
-    if (m_local_kernel) {
-      m_device.deallocate((void*)m_kernel);
-      m_local_kernel = false;
-    }
-    m_kernel = NULL;
-  }
-
-  void evalTo(typename XprType::Scalar* buffer) {
-    evalSubExprsIfNeeded(NULL);
-    for (int i = 0; i < dimensions().TotalSize(); ++i) {
-      buffer[i] += coeff(i);
-    }
-    cleanup();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
-  {
-    CoeffReturnType result = CoeffReturnType(0);
-    return result;
-  }
-
-  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
-
- private:
-  array<Index, NumDims> m_inputStride;
-  array<Index, NumDims> m_outputStride;
-
-  array<Index, NumKernelDims> m_indexStride;
-  array<Index, NumKernelDims> m_kernelStride;
-  TensorEvaluator<InputArgType, Device> m_inputImpl;
-  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
-  Dimensions m_dimensions;
-
-  KernelArgType m_kernelArg;
-  const Scalar* m_kernel;
-  bool m_local_kernel;
-  const Device& m_device;
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTIONBYFFT_H
diff --git a/third_party/examples/eager/spinn/spinn.py b/third_party/examples/eager/spinn/spinn.py
index c242ef3fddc04069b2d6ca14e622ec915d08b023..de63ebe9e67d37dcc0ecf309edf1fae89169af5f 100644
--- a/third_party/examples/eager/spinn/spinn.py
+++ b/third_party/examples/eager/spinn/spinn.py
@@ -626,7 +626,7 @@ def train_or_infer_spinn(embed,
     model = SNLIClassifier(config, embed)
     global_step = tf.train.get_or_create_global_step()
     trainer = SNLIClassifierTrainer(model, config.lr)
-    checkpoint = tfe.Checkpoint(trainer=trainer, global_step=global_step)
+    checkpoint = tf.train.Checkpoint(trainer=trainer, global_step=global_step)
     checkpoint.restore(tf.train.latest_checkpoint(config.logdir))
 
     if inference_sentence_pair:
diff --git a/third_party/toolchains/BUILD b/third_party/toolchains/BUILD
index fc3183a754369fc30dbce40c2bf7b6828ea497c3..ec1006fe23567983785be7b8f15a3f44dcb47900 100644
--- a/third_party/toolchains/BUILD
+++ b/third_party/toolchains/BUILD
@@ -17,6 +17,6 @@ platform(
     remote_execution_properties = """
         properties: {
             name: "container-image"
-            value:"docker://gcr.io/asci-toolchain/nosla-ubuntu16_04-tf@sha256:800a7b68cabef15419695c188ed33ed70adf678c2371b97b236f3ae26c38274d"
+            value:"docker://gcr.io/asci-toolchain/nosla-ubuntu16_04-tf@sha256:495a025ed5e273cfa5d53357ef93ac20500c008994e0be106c509f51555fb93c"
         }""",
 )
diff --git a/third_party/toolchains/cpus/py/BUILD b/third_party/toolchains/cpus/py/BUILD
index c175742cbfe918e55035e89b7454596acd43307e..1235988abb7fa9982b26f470b52b88d40b989c26 100644
--- a/third_party/toolchains/cpus/py/BUILD
+++ b/third_party/toolchains/cpus/py/BUILD
@@ -6,18 +6,24 @@ licenses(["restricted"])
 
 package(default_visibility = ["//visibility:public"])
 
+# To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
+# See https://docs.python.org/3/extending/windows.html
+cc_import(
+    name = "python_lib",
+    interface_library = select({
+        ":windows": ":python_import_lib",
+        # A placeholder for Unix platforms which makes --no_build happy.
+        "//conditions:default": "not-existing.lib",
+    }),
+    system_provided = 1,
+)
+
 cc_library(
     name = "python_headers",
     hdrs = [":python_include"],
-    data = select({
-        ":windows": [":python_import_lib"],
-        "//conditions:default": [],
-    }),
     includes = ["python_include"],
-    linkopts = select({
-        # TODO(pcloudy): Ideally, this should just go into deps after resolving
-        # https://github.com/bazelbuild/bazel/issues/3237,
-        ":windows": ["$(locations :python_import_lib)"],
+    deps = select({
+        ":windows": [":python_lib"],
         "//conditions:default": [],
     }),
 )
@@ -37,161 +43,135 @@ config_setting(
 genrule(
     name = "python_include",
     outs = [
+        "python_include/Python-ast.h",
+        "python_include/Python.h",
+        "python_include/abstract.h",
+        "python_include/asdl.h",
+        "python_include/ast.h",
+        "python_include/bitset.h",
+        "python_include/boolobject.h",
+        "python_include/bufferobject.h",
+        "python_include/bytearrayobject.h",
+        "python_include/bytes_methods.h",
+        "python_include/bytesobject.h",
+        "python_include/cStringIO.h",
+        "python_include/cellobject.h",
+        "python_include/ceval.h",
+        "python_include/classobject.h",
+        "python_include/cobject.h",
         "python_include/code.h",
+        "python_include/codecs.h",
+        "python_include/compile.h",
+        "python_include/complexobject.h",
+        "python_include/datetime.h",
+        "python_include/descrobject.h",
+        "python_include/dictobject.h",
         "python_include/dtoa.h",
-        "python_include/tupleobject.h",
-        "python_include/object.h",
-        "python_include/ast.h",
-        "python_include/pymacconfig.h",
+        "python_include/enumobject.h",
         "python_include/errcode.h",
+        "python_include/eval.h",
+        "python_include/fileobject.h",
+        "python_include/floatobject.h",
         "python_include/frameobject.h",
-        "python_include/pgenheaders.h",
-        "python_include/cellobject.h",
+        "python_include/funcobject.h",
+        "python_include/genobject.h",
+        "python_include/graminit.h",
+        "python_include/grammar.h",
+        "python_include/import.h",
         "python_include/intobject.h",
-        "python_include/pythread.h",
-        "python_include/cStringIO.h",
-        "python_include/boolobject.h",
+        "python_include/intrcheck.h",
+        "python_include/iterobject.h",
+        "python_include/listobject.h",
+        "python_include/longintrepr.h",
+        "python_include/longobject.h",
+        "python_include/marshal.h",
+        "python_include/memoryobject.h",
+        "python_include/metagrammar.h",
+        "python_include/methodobject.h",
         "python_include/modsupport.h",
-        "python_include/import.h",
-        "python_include/pymath.h",
+        "python_include/moduleobject.h",
         "python_include/node.h",
-        "python_include/funcobject.h",
-        "python_include/eval.h",
-        "python_include/longintrepr.h",
-        "python_include/floatobject.h",
-        "python_include/rangeobject.h",
-        "python_include/pyfpe.h",
-        "python_include/pystrcmp.h",
-        "python_include/dictobject.h",
-        "python_include/pyarena.h",
+        "python_include/object.h",
         "python_include/objimpl.h",
-        "python_include/bitset.h",
-        "python_include/memoryobject.h",
-        "python_include/bytearrayobject.h",
+        "python_include/opcode.h",
+        "python_include/osdefs.h",
+        "python_include/parsetok.h",
+        "python_include/patchlevel.h",
+        "python_include/pgen.h",
+        "python_include/pgenheaders.h",
+        "python_include/py_curses.h",
+        "python_include/pyarena.h",
+        "python_include/pycapsule.h",
+        "python_include/pyconfig.h",
+        "python_include/pyctype.h",
         "python_include/pydebug.h",
         "python_include/pyerrors.h",
-        "python_include/weakrefobject.h",
-        "python_include/grammar.h",
-        "python_include/symtable.h",
-        "python_include/longobject.h",
-        "python_include/structmember.h",
-        "python_include/enumobject.h",
-        "python_include/classobject.h",
-        "python_include/unicodeobject.h",
-        "python_include/sliceobject.h",
-        "python_include/pystrtod.h",
-        "python_include/genobject.h",
-        "python_include/pymactoolbox.h",
-        "python_include/compile.h",
         "python_include/pyexpat.h",
-        "python_include/asdl.h",
-        "python_include/codecs.h",
-        "python_include/pyctype.h",
-        "python_include/sysmodule.h",
-        "python_include/methodobject.h",
-        "python_include/graminit.h",
-        "python_include/cobject.h",
-        "python_include/intrcheck.h",
-        "python_include/pyport.h",
-        "python_include/warnings.h",
-        "python_include/osdefs.h",
-        "python_include/fileobject.h",
-        "python_include/stringobject.h",
-        "python_include/timefuncs.h",
-        "python_include/traceback.h",
-        "python_include/ceval.h",
-        "python_include/bytes_methods.h",
-        "python_include/pyconfig.h",
-        "python_include/Python.h",
-        "python_include/moduleobject.h",
-        "python_include/pystate.h",
-        "python_include/descrobject.h",
-        "python_include/ucnhash.h",
+        "python_include/pyfpe.h",
         "python_include/pygetopt.h",
+        "python_include/pymacconfig.h",
+        "python_include/pymactoolbox.h",
+        "python_include/pymath.h",
         "python_include/pymem.h",
-        "python_include/complexobject.h",
-        "python_include/structseq.h",
-        "python_include/datetime.h",
+        "python_include/pyport.h",
+        "python_include/pystate.h",
+        "python_include/pystrcmp.h",
+        "python_include/pystrtod.h",
         "python_include/pythonrun.h",
-        "python_include/numpy/oldnumeric.h",
-        "python_include/numpy/npy_1_7_deprecated_api.h",
-        "python_include/numpy/ufunc_api.txt",
-        "python_include/numpy/multiarray_api.txt",
-        "python_include/numpy/halffloat.h",
-        "python_include/numpy/npy_common.h",
-        "python_include/numpy/utils.h",
-        "python_include/numpy/npy_interrupt.h",
-        "python_include/numpy/npy_endian.h",
-        "python_include/numpy/__ufunc_api.h",
-        "python_include/numpy/_neighborhood_iterator_imp.h",
-        "python_include/numpy/ufuncobject.h",
-        "python_include/numpy/ndarraytypes.h",
-        "python_include/numpy/npy_math.h",
-        "python_include/numpy/noprefix.h",
-        "python_include/numpy/npy_3kcompat.h",
-        "python_include/numpy/arrayscalars.h",
-        "python_include/numpy/npy_os.h",
-        "python_include/numpy/ndarrayobject.h",
-        "python_include/numpy/npy_no_deprecated_api.h",
-        "python_include/numpy/arrayobject.h",
-        "python_include/numpy/_numpyconfig.h",
-        "python_include/numpy/__multiarray_api.h",
-        "python_include/numpy/npy_cpu.h",
-        "python_include/numpy/old_defines.h",
-        "python_include/numpy/numpyconfig.h",
-        "python_include/pycapsule.h",
+        "python_include/pythread.h",
+        "python_include/rangeobject.h",
         "python_include/setobject.h",
-        "python_include/listobject.h",
-        "python_include/bytesobject.h",
-        "python_include/pgen.h",
-        "python_include/patchlevel.h",
-        "python_include/opcode.h",
-        "python_include/parsetok.h",
-        "python_include/marshal.h",
+        "python_include/sliceobject.h",
+        "python_include/stringobject.h",
+        "python_include/structmember.h",
+        "python_include/structseq.h",
+        "python_include/symtable.h",
+        "python_include/sysmodule.h",
+        "python_include/timefuncs.h",
         "python_include/token.h",
-        "python_include/iterobject.h",
-        "python_include/abstract.h",
-        "python_include/py_curses.h",
-        "python_include/metagrammar.h",
-        "python_include/bufferobject.h",
-        "python_include/Python-ast.h",
+        "python_include/traceback.h",
+        "python_include/tupleobject.h",
+        "python_include/ucnhash.h",
+        "python_include/unicodeobject.h",
+        "python_include/warnings.h",
+        "python_include/weakrefobject.h",
     ],
     cmd = """
-cp "/usr/include/python2.7/code.h" "$(@D)/python_include/code.h" && cp "/usr/include/python2.7/dtoa.h" "$(@D)/python_include/dtoa.h" && cp "/usr/include/python2.7/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp "/usr/include/python2.7/object.h" "$(@D)/python_include/object.h" && cp "/usr/include/python2.7/ast.h" "$(@D)/python_include/ast.h" && cp "/usr/include/python2.7/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp "/usr/include/python2.7/errcode.h" "$(@D)/python_include/errcode.h" && cp "/usr/include/python2.7/frameobject.h" "$(@D)/python_include/frameobject.h" && cp "/usr/include/python2.7/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp "/usr/include/python2.7/cellobject.h" "$(@D)/python_include/cellobject.h" && cp "/usr/include/python2.7/intobject.h" "$(@D)/python_include/intobject.h" && cp "/usr/include/python2.7/pythread.h" "$(@D)/python_include/pythread.h" && cp "/usr/include/python2.7/cStringIO.h" "$(@D)/python_include/cStringIO.h" && cp "/usr/include/python2.7/boolobject.h" "$(@D)/python_include/boolobject.h" && cp "/usr/include/python2.7/modsupport.h" "$(@D)/python_include/modsupport.h" && cp "/usr/include/python2.7/import.h" "$(@D)/python_include/import.h" && cp "/usr/include/python2.7/pymath.h" "$(@D)/python_include/pymath.h" && cp "/usr/include/python2.7/node.h" "$(@D)/python_include/node.h" && cp "/usr/include/python2.7/funcobject.h" "$(@D)/python_include/funcobject.h" && cp "/usr/include/python2.7/eval.h" "$(@D)/python_include/eval.h" && cp "/usr/include/python2.7/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp "/usr/include/python2.7/floatobject.h" "$(@D)/python_include/floatobject.h" && cp "/usr/include/python2.7/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp "/usr/include/python2.7/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp "/usr/include/python2.7/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp "/usr/include/python2.7/dictobject.h" "$(@D)/python_include/dictobject.h" && cp "/usr/include/python2.7/pyarena.h" "$(@D)/python_include/pyarena.h" && cp "/usr/include/python2.7/objimpl.h" "$(@D)/python_include/objimpl.h" && cp "/usr/include/python2.7/bitset.h" "$(@D)/python_include/bitset.h" && cp "/usr/include/python2.7/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp "/usr/include/python2.7/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp "/usr/include/python2.7/pydebug.h" "$(@D)/python_include/pydebug.h" && cp "/usr/include/python2.7/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp "/usr/include/python2.7/weakrefobject.h" "$(@D)/python_include/weakrefobject.h" && cp "/usr/include/python2.7/grammar.h" "$(@D)/python_include/grammar.h" && cp "/usr/include/python2.7/symtable.h" "$(@D)/python_include/symtable.h" && cp "/usr/include/python2.7/longobject.h" "$(@D)/python_include/longobject.h" && cp "/usr/include/python2.7/structmember.h" "$(@D)/python_include/structmember.h" && cp "/usr/include/python2.7/enumobject.h" "$(@D)/python_include/enumobject.h" && cp "/usr/include/python2.7/classobject.h" "$(@D)/python_include/classobject.h" && cp "/usr/include/python2.7/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp "/usr/include/python2.7/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp "/usr/include/python2.7/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp "/usr/include/python2.7/genobject.h" "$(@D)/python_include/genobject.h" && cp "/usr/include/python2.7/pymactoolbox.h" "$(@D)/python_include/pymactoolbox.h" && cp "/usr/include/python2.7/compile.h" "$(@D)/python_include/compile.h" && cp "/usr/include/python2.7/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp "/usr/include/python2.7/asdl.h" "$(@D)/python_include/asdl.h" && cp "/usr/include/python2.7/codecs.h" "$(@D)/python_include/codecs.h" && cp "/usr/include/python2.7/pyctype.h" "$(@D)/python_include/pyctype.h" && cp "/usr/include/python2.7/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp "/usr/include/python2.7/methodobject.h" "$(@D)/python_include/methodobject.h" && cp "/usr/include/python2.7/graminit.h" "$(@D)/python_include/graminit.h" && cp "/usr/include/python2.7/cobject.h" "$(@D)/python_include/cobject.h" && cp "/usr/include/python2.7/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp "/usr/include/python2.7/pyport.h" "$(@D)/python_include/pyport.h" && cp "/usr/include/python2.7/warnings.h" "$(@D)/python_include/warnings.h" && cp "/usr/include/python2.7/osdefs.h" "$(@D)/python_include/osdefs.h" && cp "/usr/include/python2.7/fileobject.h" "$(@D)/python_include/fileobject.h" && cp "/usr/include/python2.7/stringobject.h" "$(@D)/python_include/stringobject.h" && cp "/usr/include/python2.7/timefuncs.h" "$(@D)/python_include/timefuncs.h" && cp "/usr/include/python2.7/traceback.h" "$(@D)/python_include/traceback.h" && cp "/usr/include/python2.7/ceval.h" "$(@D)/python_include/ceval.h" && cp "/usr/include/python2.7/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp "/usr/include/python2.7/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp "/usr/include/python2.7/Python.h" "$(@D)/python_include/Python.h" && cp "/usr/include/python2.7/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp "/usr/include/python2.7/pystate.h" "$(@D)/python_include/pystate.h" && cp "/usr/include/python2.7/descrobject.h" "$(@D)/python_include/descrobject.h" && cp "/usr/include/python2.7/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp "/usr/include/python2.7/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp "/usr/include/python2.7/pymem.h" "$(@D)/python_include/pymem.h" && cp "/usr/include/python2.7/complexobject.h" "$(@D)/python_include/complexobject.h" && cp "/usr/include/python2.7/structseq.h" "$(@D)/python_include/structseq.h" && cp "/usr/include/python2.7/datetime.h" "$(@D)/python_include/datetime.h" && cp "/usr/include/python2.7/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp "/usr/include/python2.7/numpy/oldnumeric.h" "$(@D)/python_include/numpy/oldnumeric.h" && cp "/usr/include/python2.7/numpy/npy_1_7_deprecated_api.h" "$(@D)/python_include/numpy/npy_1_7_deprecated_api.h" && cp "/usr/include/python2.7/numpy/ufunc_api.txt" "$(@D)/python_include/numpy/ufunc_api.txt" && cp "/usr/include/python2.7/numpy/multiarray_api.txt" "$(@D)/python_include/numpy/multiarray_api.txt" && cp "/usr/include/python2.7/numpy/halffloat.h" "$(@D)/python_include/numpy/halffloat.h" && cp "/usr/include/python2.7/numpy/npy_common.h" "$(@D)/python_include/numpy/npy_common.h" && cp "/usr/include/python2.7/numpy/utils.h" "$(@D)/python_include/numpy/utils.h" && cp "/usr/include/python2.7/numpy/npy_interrupt.h" "$(@D)/python_include/numpy/npy_interrupt.h" && cp "/usr/include/python2.7/numpy/npy_endian.h" "$(@D)/python_include/numpy/npy_endian.h" && cp "/usr/include/python2.7/numpy/__ufunc_api.h" "$(@D)/python_include/numpy/__ufunc_api.h" && cp "/usr/include/python2.7/numpy/_neighborhood_iterator_imp.h" "$(@D)/python_include/numpy/_neighborhood_iterator_imp.h" && cp "/usr/include/python2.7/numpy/ufuncobject.h" "$(@D)/python_include/numpy/ufuncobject.h" && cp "/usr/include/python2.7/numpy/ndarraytypes.h" "$(@D)/python_include/numpy/ndarraytypes.h" && cp "/usr/include/python2.7/numpy/npy_math.h" "$(@D)/python_include/numpy/npy_math.h" && cp "/usr/include/python2.7/numpy/noprefix.h" "$(@D)/python_include/numpy/noprefix.h" && cp "/usr/include/python2.7/numpy/npy_3kcompat.h" "$(@D)/python_include/numpy/npy_3kcompat.h" && cp "/usr/include/python2.7/numpy/arrayscalars.h" "$(@D)/python_include/numpy/arrayscalars.h" && cp "/usr/include/python2.7/numpy/npy_os.h" "$(@D)/python_include/numpy/npy_os.h" && cp "/usr/include/python2.7/numpy/ndarrayobject.h" "$(@D)/python_include/numpy/ndarrayobject.h" && cp "/usr/include/python2.7/numpy/npy_no_deprecated_api.h" "$(@D)/python_include/numpy/npy_no_deprecated_api.h" && cp "/usr/include/python2.7/numpy/arrayobject.h" "$(@D)/python_include/numpy/arrayobject.h" && cp "/usr/include/python2.7/numpy/_numpyconfig.h" "$(@D)/python_include/numpy/_numpyconfig.h" && cp "/usr/include/python2.7/numpy/__multiarray_api.h" "$(@D)/python_include/numpy/__multiarray_api.h" && cp "/usr/include/python2.7/numpy/npy_cpu.h" "$(@D)/python_include/numpy/npy_cpu.h" && cp "/usr/include/python2.7/numpy/old_defines.h" "$(@D)/python_include/numpy/old_defines.h" && cp "/usr/include/python2.7/numpy/numpyconfig.h" "$(@D)/python_include/numpy/numpyconfig.h" && cp "/usr/include/python2.7/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp "/usr/include/python2.7/setobject.h" "$(@D)/python_include/setobject.h" && cp "/usr/include/python2.7/listobject.h" "$(@D)/python_include/listobject.h" && cp "/usr/include/python2.7/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp "/usr/include/python2.7/pgen.h" "$(@D)/python_include/pgen.h" && cp "/usr/include/python2.7/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp "/usr/include/python2.7/opcode.h" "$(@D)/python_include/opcode.h" && cp "/usr/include/python2.7/parsetok.h" "$(@D)/python_include/parsetok.h" && cp "/usr/include/python2.7/marshal.h" "$(@D)/python_include/marshal.h" && cp "/usr/include/python2.7/token.h" "$(@D)/python_include/token.h" && cp "/usr/include/python2.7/iterobject.h" "$(@D)/python_include/iterobject.h" && cp "/usr/include/python2.7/abstract.h" "$(@D)/python_include/abstract.h" && cp "/usr/include/python2.7/py_curses.h" "$(@D)/python_include/py_curses.h" && cp "/usr/include/python2.7/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp "/usr/include/python2.7/bufferobject.h" "$(@D)/python_include/bufferobject.h" && cp "/usr/include/python2.7/Python-ast.h" "$(@D)/python_include/Python-ast.h"
+cp "/usr/include/python2.7/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp "/usr/include/python2.7/Python.h" "$(@D)/python_include/Python.h" && cp "/usr/include/python2.7/abstract.h" "$(@D)/python_include/abstract.h" && cp "/usr/include/python2.7/asdl.h" "$(@D)/python_include/asdl.h" && cp "/usr/include/python2.7/ast.h" "$(@D)/python_include/ast.h" && cp "/usr/include/python2.7/bitset.h" "$(@D)/python_include/bitset.h" && cp "/usr/include/python2.7/boolobject.h" "$(@D)/python_include/boolobject.h" && cp "/usr/include/python2.7/bufferobject.h" "$(@D)/python_include/bufferobject.h" && cp "/usr/include/python2.7/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp "/usr/include/python2.7/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp "/usr/include/python2.7/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp "/usr/include/python2.7/cStringIO.h" "$(@D)/python_include/cStringIO.h" && cp "/usr/include/python2.7/cellobject.h" "$(@D)/python_include/cellobject.h" && cp "/usr/include/python2.7/ceval.h" "$(@D)/python_include/ceval.h" && cp "/usr/include/python2.7/classobject.h" "$(@D)/python_include/classobject.h" && cp "/usr/include/python2.7/cobject.h" "$(@D)/python_include/cobject.h" && cp "/usr/include/python2.7/code.h" "$(@D)/python_include/code.h" && cp "/usr/include/python2.7/codecs.h" "$(@D)/python_include/codecs.h" && cp "/usr/include/python2.7/compile.h" "$(@D)/python_include/compile.h" && cp "/usr/include/python2.7/complexobject.h" "$(@D)/python_include/complexobject.h" && cp "/usr/include/python2.7/datetime.h" "$(@D)/python_include/datetime.h" && cp "/usr/include/python2.7/descrobject.h" "$(@D)/python_include/descrobject.h" && cp "/usr/include/python2.7/dictobject.h" "$(@D)/python_include/dictobject.h" && cp "/usr/include/python2.7/dtoa.h" "$(@D)/python_include/dtoa.h" && cp "/usr/include/python2.7/enumobject.h" "$(@D)/python_include/enumobject.h" && cp "/usr/include/python2.7/errcode.h" "$(@D)/python_include/errcode.h" && cp "/usr/include/python2.7/eval.h" "$(@D)/python_include/eval.h" && cp "/usr/include/python2.7/fileobject.h" "$(@D)/python_include/fileobject.h" && cp "/usr/include/python2.7/floatobject.h" "$(@D)/python_include/floatobject.h" && cp "/usr/include/python2.7/frameobject.h" "$(@D)/python_include/frameobject.h" && cp "/usr/include/python2.7/funcobject.h" "$(@D)/python_include/funcobject.h" && cp "/usr/include/python2.7/genobject.h" "$(@D)/python_include/genobject.h" && cp "/usr/include/python2.7/graminit.h" "$(@D)/python_include/graminit.h" && cp "/usr/include/python2.7/grammar.h" "$(@D)/python_include/grammar.h" && cp "/usr/include/python2.7/import.h" "$(@D)/python_include/import.h" && cp "/usr/include/python2.7/intobject.h" "$(@D)/python_include/intobject.h" && cp "/usr/include/python2.7/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp "/usr/include/python2.7/iterobject.h" "$(@D)/python_include/iterobject.h" && cp "/usr/include/python2.7/listobject.h" "$(@D)/python_include/listobject.h" && cp "/usr/include/python2.7/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp "/usr/include/python2.7/longobject.h" "$(@D)/python_include/longobject.h" && cp "/usr/include/python2.7/marshal.h" "$(@D)/python_include/marshal.h" && cp "/usr/include/python2.7/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp "/usr/include/python2.7/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp "/usr/include/python2.7/methodobject.h" "$(@D)/python_include/methodobject.h" && cp "/usr/include/python2.7/modsupport.h" "$(@D)/python_include/modsupport.h" && cp "/usr/include/python2.7/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp "/usr/include/python2.7/node.h" "$(@D)/python_include/node.h" && cp "/usr/include/python2.7/object.h" "$(@D)/python_include/object.h" && cp "/usr/include/python2.7/objimpl.h" "$(@D)/python_include/objimpl.h" && cp "/usr/include/python2.7/opcode.h" "$(@D)/python_include/opcode.h" && cp "/usr/include/python2.7/osdefs.h" "$(@D)/python_include/osdefs.h" && cp "/usr/include/python2.7/parsetok.h" "$(@D)/python_include/parsetok.h" && cp "/usr/include/python2.7/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp "/usr/include/python2.7/pgen.h" "$(@D)/python_include/pgen.h" && cp "/usr/include/python2.7/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp "/usr/include/python2.7/py_curses.h" "$(@D)/python_include/py_curses.h" && cp "/usr/include/python2.7/pyarena.h" "$(@D)/python_include/pyarena.h" && cp "/usr/include/python2.7/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp "/usr/include/python2.7/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp "/usr/include/python2.7/pyctype.h" "$(@D)/python_include/pyctype.h" && cp "/usr/include/python2.7/pydebug.h" "$(@D)/python_include/pydebug.h" && cp "/usr/include/python2.7/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp "/usr/include/python2.7/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp "/usr/include/python2.7/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp "/usr/include/python2.7/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp "/usr/include/python2.7/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp "/usr/include/python2.7/pymactoolbox.h" "$(@D)/python_include/pymactoolbox.h" && cp "/usr/include/python2.7/pymath.h" "$(@D)/python_include/pymath.h" && cp "/usr/include/python2.7/pymem.h" "$(@D)/python_include/pymem.h" && cp "/usr/include/python2.7/pyport.h" "$(@D)/python_include/pyport.h" && cp "/usr/include/python2.7/pystate.h" "$(@D)/python_include/pystate.h" && cp "/usr/include/python2.7/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp "/usr/include/python2.7/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp "/usr/include/python2.7/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp "/usr/include/python2.7/pythread.h" "$(@D)/python_include/pythread.h" && cp "/usr/include/python2.7/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp "/usr/include/python2.7/setobject.h" "$(@D)/python_include/setobject.h" && cp "/usr/include/python2.7/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp "/usr/include/python2.7/stringobject.h" "$(@D)/python_include/stringobject.h" && cp "/usr/include/python2.7/structmember.h" "$(@D)/python_include/structmember.h" && cp "/usr/include/python2.7/structseq.h" "$(@D)/python_include/structseq.h" && cp "/usr/include/python2.7/symtable.h" "$(@D)/python_include/symtable.h" && cp "/usr/include/python2.7/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp "/usr/include/python2.7/timefuncs.h" "$(@D)/python_include/timefuncs.h" && cp "/usr/include/python2.7/token.h" "$(@D)/python_include/token.h" && cp "/usr/include/python2.7/traceback.h" "$(@D)/python_include/traceback.h" && cp "/usr/include/python2.7/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp "/usr/include/python2.7/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp "/usr/include/python2.7/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp "/usr/include/python2.7/warnings.h" "$(@D)/python_include/warnings.h" && cp "/usr/include/python2.7/weakrefobject.h" "$(@D)/python_include/weakrefobject.h"
    """,
 )
 
 genrule(
     name = "numpy_include",
     outs = [
-        "numpy_include/numpy/oldnumeric.h",
-        "numpy_include/numpy/npy_1_7_deprecated_api.h",
-        "numpy_include/numpy/ufunc_api.txt",
-        "numpy_include/numpy/multiarray_api.txt",
-        "numpy_include/numpy/halffloat.h",
-        "numpy_include/numpy/npy_common.h",
-        "numpy_include/numpy/utils.h",
-        "numpy_include/numpy/npy_interrupt.h",
-        "numpy_include/numpy/npy_endian.h",
+        "numpy_include/numpy/__multiarray_api.h",
         "numpy_include/numpy/__ufunc_api.h",
         "numpy_include/numpy/_neighborhood_iterator_imp.h",
-        "numpy_include/numpy/ufuncobject.h",
+        "numpy_include/numpy/_numpyconfig.h",
+        "numpy_include/numpy/arrayobject.h",
+        "numpy_include/numpy/arrayscalars.h",
+        "numpy_include/numpy/halffloat.h",
+        "numpy_include/numpy/multiarray_api.txt",
+        "numpy_include/numpy/ndarrayobject.h",
         "numpy_include/numpy/ndarraytypes.h",
-        "numpy_include/numpy/npy_math.h",
         "numpy_include/numpy/noprefix.h",
+        "numpy_include/numpy/npy_1_7_deprecated_api.h",
         "numpy_include/numpy/npy_3kcompat.h",
-        "numpy_include/numpy/arrayscalars.h",
-        "numpy_include/numpy/npy_os.h",
-        "numpy_include/numpy/ndarrayobject.h",
-        "numpy_include/numpy/npy_no_deprecated_api.h",
-        "numpy_include/numpy/arrayobject.h",
-        "numpy_include/numpy/_numpyconfig.h",
-        "numpy_include/numpy/__multiarray_api.h",
+        "numpy_include/numpy/npy_common.h",
         "numpy_include/numpy/npy_cpu.h",
-        "numpy_include/numpy/old_defines.h",
+        "numpy_include/numpy/npy_endian.h",
+        "numpy_include/numpy/npy_interrupt.h",
+        "numpy_include/numpy/npy_math.h",
+        "numpy_include/numpy/npy_no_deprecated_api.h",
+        "numpy_include/numpy/npy_os.h",
         "numpy_include/numpy/numpyconfig.h",
+        "numpy_include/numpy/old_defines.h",
+        "numpy_include/numpy/oldnumeric.h",
+        "numpy_include/numpy/ufunc_api.txt",
+        "numpy_include/numpy/ufuncobject.h",
+        "numpy_include/numpy/utils.h",
     ],
     cmd = """
-cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp "/usr/lib/python2.7/dist-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h"
+cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp "/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h"
    """,
 )
diff --git a/third_party/toolchains/cpus/py3/BUILD b/third_party/toolchains/cpus/py3/BUILD
index 932a25239fb5f7e35c2ada46b70309e6635bcb4a..d47256ebef88fa39d904c9815ce4295e5c693ffa 100644
--- a/third_party/toolchains/cpus/py3/BUILD
+++ b/third_party/toolchains/cpus/py3/BUILD
@@ -6,18 +6,24 @@ licenses(["restricted"])
 
 package(default_visibility = ["//visibility:public"])
 
+# To build Python C/C++ extension on Windows, we need to link to python import library pythonXY.lib
+# See https://docs.python.org/3/extending/windows.html
+cc_import(
+    name = "python_lib",
+    interface_library = select({
+        ":windows": ":python_import_lib",
+        # A placeholder for Unix platforms which makes --no_build happy.
+        "//conditions:default": "not-existing.lib",
+    }),
+    system_provided = 1,
+)
+
 cc_library(
     name = "python_headers",
     hdrs = [":python_include"],
-    data = select({
-        ":windows": [":python_import_lib"],
-        "//conditions:default": [],
-    }),
     includes = ["python_include"],
-    linkopts = select({
-        # TODO(pcloudy): Ideally, this should just go into deps after resolving
-        # https://github.com/bazelbuild/bazel/issues/3237,
-        ":windows": ["$(locations :python_import_lib)"],
+    deps = select({
+        ":windows": [":python_lib"],
         "//conditions:default": [],
     }),
 )
@@ -37,143 +43,143 @@ config_setting(
 genrule(
     name = "python_include",
     outs = [
-        "python_include/code.h",
-        "python_include/dtoa.h",
-        "python_include/tupleobject.h",
-        "python_include/object.h",
-        "python_include/ast.h",
-        "python_include/pymacconfig.h",
-        "python_include/errcode.h",
-        "python_include/frameobject.h",
-        "python_include/typeslots.h",
-        "python_include/pgenheaders.h",
-        "python_include/cellobject.h",
-        "python_include/pythread.h",
-        "python_include/boolobject.h",
+        "python_include/Python-ast.h",
+        "python_include/Python.h",
+        "python_include/abstract.h",
         "python_include/accu.h",
-        "python_include/modsupport.h",
-        "python_include/import.h",
-        "python_include/pymath.h",
-        "python_include/node.h",
-        "python_include/funcobject.h",
-        "python_include/eval.h",
-        "python_include/pyatomic.h",
-        "python_include/longintrepr.h",
-        "python_include/floatobject.h",
-        "python_include/rangeobject.h",
-        "python_include/pyfpe.h",
-        "python_include/pystrcmp.h",
-        "python_include/fileutils.h",
-        "python_include/dictobject.h",
-        "python_include/pyarena.h",
-        "python_include/osmodule.h",
-        "python_include/objimpl.h",
+        "python_include/asdl.h",
+        "python_include/ast.h",
         "python_include/bitset.h",
-        "python_include/memoryobject.h",
+        "python_include/bltinmodule.h",
+        "python_include/boolobject.h",
         "python_include/bytearrayobject.h",
-        "python_include/pydebug.h",
-        "python_include/pyerrors.h",
-        "python_include/weakrefobject.h",
-        "python_include/grammar.h",
-        "python_include/symtable.h",
-        "python_include/longobject.h",
-        "python_include/structmember.h",
-        "python_include/enumobject.h",
-        "python_include/pymacro.h",
+        "python_include/bytes_methods.h",
+        "python_include/bytesobject.h",
+        "python_include/cellobject.h",
+        "python_include/ceval.h",
         "python_include/classobject.h",
-        "python_include/unicodeobject.h",
-        "python_include/sliceobject.h",
-        "python_include/pystrtod.h",
-        "python_include/genobject.h",
-        "python_include/compile.h",
-        "python_include/pyexpat.h",
-        "python_include/asdl.h",
+        "python_include/code.h",
         "python_include/codecs.h",
+        "python_include/compile.h",
+        "python_include/complexobject.h",
+        "python_include/datetime.h",
+        "python_include/descrobject.h",
+        "python_include/dictobject.h",
+        "python_include/dtoa.h",
         "python_include/dynamic_annotations.h",
-        "python_include/pyctype.h",
-        "python_include/sysmodule.h",
-        "python_include/methodobject.h",
+        "python_include/enumobject.h",
+        "python_include/errcode.h",
+        "python_include/eval.h",
+        "python_include/fileobject.h",
+        "python_include/fileutils.h",
+        "python_include/floatobject.h",
+        "python_include/frameobject.h",
+        "python_include/funcobject.h",
+        "python_include/genobject.h",
         "python_include/graminit.h",
-        "python_include/bltinmodule.h",
+        "python_include/grammar.h",
+        "python_include/import.h",
         "python_include/intrcheck.h",
-        "python_include/pyport.h",
-        "python_include/warnings.h",
-        "python_include/osdefs.h",
-        "python_include/pydtrace.h",
-        "python_include/pylifecycle.h",
-        "python_include/fileobject.h",
-        "python_include/pytime.h",
-        "python_include/traceback.h",
-        "python_include/ceval.h",
-        "python_include/bytes_methods.h",
-        "python_include/namespaceobject.h",
-        "python_include/pyconfig.h",
-        "python_include/Python.h",
+        "python_include/iterobject.h",
+        "python_include/listobject.h",
+        "python_include/longintrepr.h",
+        "python_include/longobject.h",
+        "python_include/marshal.h",
+        "python_include/memoryobject.h",
+        "python_include/metagrammar.h",
+        "python_include/methodobject.h",
+        "python_include/modsupport.h",
         "python_include/moduleobject.h",
-        "python_include/pystate.h",
-        "python_include/descrobject.h",
+        "python_include/namespaceobject.h",
+        "python_include/node.h",
+        "python_include/object.h",
+        "python_include/objimpl.h",
         "python_include/odictobject.h",
-        "python_include/ucnhash.h",
+        "python_include/opcode.h",
+        "python_include/osdefs.h",
+        "python_include/osmodule.h",
+        "python_include/parsetok.h",
+        "python_include/patchlevel.h",
+        "python_include/pgen.h",
+        "python_include/pgenheaders.h",
+        "python_include/py_curses.h",
+        "python_include/pyarena.h",
+        "python_include/pyatomic.h",
+        "python_include/pycapsule.h",
+        "python_include/pyconfig.h",
+        "python_include/pyctype.h",
+        "python_include/pydebug.h",
+        "python_include/pydtrace.h",
+        "python_include/pyerrors.h",
+        "python_include/pyexpat.h",
+        "python_include/pyfpe.h",
         "python_include/pygetopt.h",
+        "python_include/pyhash.h",
+        "python_include/pylifecycle.h",
+        "python_include/pymacconfig.h",
+        "python_include/pymacro.h",
+        "python_include/pymath.h",
         "python_include/pymem.h",
-        "python_include/complexobject.h",
-        "python_include/structseq.h",
-        "python_include/datetime.h",
+        "python_include/pyport.h",
+        "python_include/pystate.h",
+        "python_include/pystrcmp.h",
+        "python_include/pystrhex.h",
+        "python_include/pystrtod.h",
         "python_include/pythonrun.h",
-        "python_include/pyhash.h",
-        "python_include/pycapsule.h",
+        "python_include/pythread.h",
+        "python_include/pytime.h",
+        "python_include/rangeobject.h",
         "python_include/setobject.h",
-        "python_include/listobject.h",
-        "python_include/bytesobject.h",
-        "python_include/pgen.h",
-        "python_include/patchlevel.h",
-        "python_include/opcode.h",
-        "python_include/parsetok.h",
-        "python_include/pystrhex.h",
-        "python_include/marshal.h",
+        "python_include/sliceobject.h",
+        "python_include/structmember.h",
+        "python_include/structseq.h",
+        "python_include/symtable.h",
+        "python_include/sysmodule.h",
         "python_include/token.h",
-        "python_include/iterobject.h",
-        "python_include/abstract.h",
-        "python_include/py_curses.h",
-        "python_include/metagrammar.h",
-        "python_include/Python-ast.h",
+        "python_include/traceback.h",
+        "python_include/tupleobject.h",
+        "python_include/typeslots.h",
+        "python_include/ucnhash.h",
+        "python_include/unicodeobject.h",
+        "python_include/warnings.h",
+        "python_include/weakrefobject.h",
     ],
     cmd = """
-cp "/opt/python3.6/include/python3.6m/code.h" "$(@D)/python_include/code.h" && cp "/opt/python3.6/include/python3.6m/dtoa.h" "$(@D)/python_include/dtoa.h" && cp "/opt/python3.6/include/python3.6m/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp "/opt/python3.6/include/python3.6m/object.h" "$(@D)/python_include/object.h" && cp "/opt/python3.6/include/python3.6m/ast.h" "$(@D)/python_include/ast.h" && cp "/opt/python3.6/include/python3.6m/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp "/opt/python3.6/include/python3.6m/errcode.h" "$(@D)/python_include/errcode.h" && cp "/opt/python3.6/include/python3.6m/frameobject.h" "$(@D)/python_include/frameobject.h" && cp "/opt/python3.6/include/python3.6m/typeslots.h" "$(@D)/python_include/typeslots.h" && cp "/opt/python3.6/include/python3.6m/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp "/opt/python3.6/include/python3.6m/cellobject.h" "$(@D)/python_include/cellobject.h" && cp "/opt/python3.6/include/python3.6m/pythread.h" "$(@D)/python_include/pythread.h" && cp "/opt/python3.6/include/python3.6m/boolobject.h" "$(@D)/python_include/boolobject.h" && cp "/opt/python3.6/include/python3.6m/accu.h" "$(@D)/python_include/accu.h" && cp "/opt/python3.6/include/python3.6m/modsupport.h" "$(@D)/python_include/modsupport.h" && cp "/opt/python3.6/include/python3.6m/import.h" "$(@D)/python_include/import.h" && cp "/opt/python3.6/include/python3.6m/pymath.h" "$(@D)/python_include/pymath.h" && cp "/opt/python3.6/include/python3.6m/node.h" "$(@D)/python_include/node.h" && cp "/opt/python3.6/include/python3.6m/funcobject.h" "$(@D)/python_include/funcobject.h" && cp "/opt/python3.6/include/python3.6m/eval.h" "$(@D)/python_include/eval.h" && cp "/opt/python3.6/include/python3.6m/pyatomic.h" "$(@D)/python_include/pyatomic.h" && cp "/opt/python3.6/include/python3.6m/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp "/opt/python3.6/include/python3.6m/floatobject.h" "$(@D)/python_include/floatobject.h" && cp "/opt/python3.6/include/python3.6m/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp "/opt/python3.6/include/python3.6m/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp "/opt/python3.6/include/python3.6m/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp "/opt/python3.6/include/python3.6m/fileutils.h" "$(@D)/python_include/fileutils.h" && cp "/opt/python3.6/include/python3.6m/dictobject.h" "$(@D)/python_include/dictobject.h" && cp "/opt/python3.6/include/python3.6m/pyarena.h" "$(@D)/python_include/pyarena.h" && cp "/opt/python3.6/include/python3.6m/osmodule.h" "$(@D)/python_include/osmodule.h" && cp "/opt/python3.6/include/python3.6m/objimpl.h" "$(@D)/python_include/objimpl.h" && cp "/opt/python3.6/include/python3.6m/bitset.h" "$(@D)/python_include/bitset.h" && cp "/opt/python3.6/include/python3.6m/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp "/opt/python3.6/include/python3.6m/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp "/opt/python3.6/include/python3.6m/pydebug.h" "$(@D)/python_include/pydebug.h" && cp "/opt/python3.6/include/python3.6m/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp "/opt/python3.6/include/python3.6m/weakrefobject.h" "$(@D)/python_include/weakrefobject.h" && cp "/opt/python3.6/include/python3.6m/grammar.h" "$(@D)/python_include/grammar.h" && cp "/opt/python3.6/include/python3.6m/symtable.h" "$(@D)/python_include/symtable.h" && cp "/opt/python3.6/include/python3.6m/longobject.h" "$(@D)/python_include/longobject.h" && cp "/opt/python3.6/include/python3.6m/structmember.h" "$(@D)/python_include/structmember.h" && cp "/opt/python3.6/include/python3.6m/enumobject.h" "$(@D)/python_include/enumobject.h" && cp "/opt/python3.6/include/python3.6m/pymacro.h" "$(@D)/python_include/pymacro.h" && cp "/opt/python3.6/include/python3.6m/classobject.h" "$(@D)/python_include/classobject.h" && cp "/opt/python3.6/include/python3.6m/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp "/opt/python3.6/include/python3.6m/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp "/opt/python3.6/include/python3.6m/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp "/opt/python3.6/include/python3.6m/genobject.h" "$(@D)/python_include/genobject.h" && cp "/opt/python3.6/include/python3.6m/compile.h" "$(@D)/python_include/compile.h" && cp "/opt/python3.6/include/python3.6m/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp "/opt/python3.6/include/python3.6m/asdl.h" "$(@D)/python_include/asdl.h" && cp "/opt/python3.6/include/python3.6m/codecs.h" "$(@D)/python_include/codecs.h" && cp "/opt/python3.6/include/python3.6m/dynamic_annotations.h" "$(@D)/python_include/dynamic_annotations.h" && cp "/opt/python3.6/include/python3.6m/pyctype.h" "$(@D)/python_include/pyctype.h" && cp "/opt/python3.6/include/python3.6m/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp "/opt/python3.6/include/python3.6m/methodobject.h" "$(@D)/python_include/methodobject.h" && cp "/opt/python3.6/include/python3.6m/graminit.h" "$(@D)/python_include/graminit.h" && cp "/opt/python3.6/include/python3.6m/bltinmodule.h" "$(@D)/python_include/bltinmodule.h" && cp "/opt/python3.6/include/python3.6m/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp "/opt/python3.6/include/python3.6m/pyport.h" "$(@D)/python_include/pyport.h" && cp "/opt/python3.6/include/python3.6m/warnings.h" "$(@D)/python_include/warnings.h" && cp "/opt/python3.6/include/python3.6m/osdefs.h" "$(@D)/python_include/osdefs.h" && cp "/opt/python3.6/include/python3.6m/pydtrace.h" "$(@D)/python_include/pydtrace.h" && cp "/opt/python3.6/include/python3.6m/pylifecycle.h" "$(@D)/python_include/pylifecycle.h" && cp "/opt/python3.6/include/python3.6m/fileobject.h" "$(@D)/python_include/fileobject.h" && cp "/opt/python3.6/include/python3.6m/pytime.h" "$(@D)/python_include/pytime.h" && cp "/opt/python3.6/include/python3.6m/traceback.h" "$(@D)/python_include/traceback.h" && cp "/opt/python3.6/include/python3.6m/ceval.h" "$(@D)/python_include/ceval.h" && cp "/opt/python3.6/include/python3.6m/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp "/opt/python3.6/include/python3.6m/namespaceobject.h" "$(@D)/python_include/namespaceobject.h" && cp "/opt/python3.6/include/python3.6m/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp "/opt/python3.6/include/python3.6m/Python.h" "$(@D)/python_include/Python.h" && cp "/opt/python3.6/include/python3.6m/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp "/opt/python3.6/include/python3.6m/pystate.h" "$(@D)/python_include/pystate.h" && cp "/opt/python3.6/include/python3.6m/descrobject.h" "$(@D)/python_include/descrobject.h" && cp "/opt/python3.6/include/python3.6m/odictobject.h" "$(@D)/python_include/odictobject.h" && cp "/opt/python3.6/include/python3.6m/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp "/opt/python3.6/include/python3.6m/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp "/opt/python3.6/include/python3.6m/pymem.h" "$(@D)/python_include/pymem.h" && cp "/opt/python3.6/include/python3.6m/complexobject.h" "$(@D)/python_include/complexobject.h" && cp "/opt/python3.6/include/python3.6m/structseq.h" "$(@D)/python_include/structseq.h" && cp "/opt/python3.6/include/python3.6m/datetime.h" "$(@D)/python_include/datetime.h" && cp "/opt/python3.6/include/python3.6m/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp "/opt/python3.6/include/python3.6m/pyhash.h" "$(@D)/python_include/pyhash.h" && cp "/opt/python3.6/include/python3.6m/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp "/opt/python3.6/include/python3.6m/setobject.h" "$(@D)/python_include/setobject.h" && cp "/opt/python3.6/include/python3.6m/listobject.h" "$(@D)/python_include/listobject.h" && cp "/opt/python3.6/include/python3.6m/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp "/opt/python3.6/include/python3.6m/pgen.h" "$(@D)/python_include/pgen.h" && cp "/opt/python3.6/include/python3.6m/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp "/opt/python3.6/include/python3.6m/opcode.h" "$(@D)/python_include/opcode.h" && cp "/opt/python3.6/include/python3.6m/parsetok.h" "$(@D)/python_include/parsetok.h" && cp "/opt/python3.6/include/python3.6m/pystrhex.h" "$(@D)/python_include/pystrhex.h" && cp "/opt/python3.6/include/python3.6m/marshal.h" "$(@D)/python_include/marshal.h" && cp "/opt/python3.6/include/python3.6m/token.h" "$(@D)/python_include/token.h" && cp "/opt/python3.6/include/python3.6m/iterobject.h" "$(@D)/python_include/iterobject.h" && cp "/opt/python3.6/include/python3.6m/abstract.h" "$(@D)/python_include/abstract.h" && cp "/opt/python3.6/include/python3.6m/py_curses.h" "$(@D)/python_include/py_curses.h" && cp "/opt/python3.6/include/python3.6m/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp "/opt/python3.6/include/python3.6m/Python-ast.h" "$(@D)/python_include/Python-ast.h"
+cp "/opt/python3.6/include/python3.6m/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp "/opt/python3.6/include/python3.6m/Python.h" "$(@D)/python_include/Python.h" && cp "/opt/python3.6/include/python3.6m/abstract.h" "$(@D)/python_include/abstract.h" && cp "/opt/python3.6/include/python3.6m/accu.h" "$(@D)/python_include/accu.h" && cp "/opt/python3.6/include/python3.6m/asdl.h" "$(@D)/python_include/asdl.h" && cp "/opt/python3.6/include/python3.6m/ast.h" "$(@D)/python_include/ast.h" && cp "/opt/python3.6/include/python3.6m/bitset.h" "$(@D)/python_include/bitset.h" && cp "/opt/python3.6/include/python3.6m/bltinmodule.h" "$(@D)/python_include/bltinmodule.h" && cp "/opt/python3.6/include/python3.6m/boolobject.h" "$(@D)/python_include/boolobject.h" && cp "/opt/python3.6/include/python3.6m/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp "/opt/python3.6/include/python3.6m/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp "/opt/python3.6/include/python3.6m/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp "/opt/python3.6/include/python3.6m/cellobject.h" "$(@D)/python_include/cellobject.h" && cp "/opt/python3.6/include/python3.6m/ceval.h" "$(@D)/python_include/ceval.h" && cp "/opt/python3.6/include/python3.6m/classobject.h" "$(@D)/python_include/classobject.h" && cp "/opt/python3.6/include/python3.6m/code.h" "$(@D)/python_include/code.h" && cp "/opt/python3.6/include/python3.6m/codecs.h" "$(@D)/python_include/codecs.h" && cp "/opt/python3.6/include/python3.6m/compile.h" "$(@D)/python_include/compile.h" && cp "/opt/python3.6/include/python3.6m/complexobject.h" "$(@D)/python_include/complexobject.h" && cp "/opt/python3.6/include/python3.6m/datetime.h" "$(@D)/python_include/datetime.h" && cp "/opt/python3.6/include/python3.6m/descrobject.h" "$(@D)/python_include/descrobject.h" && cp "/opt/python3.6/include/python3.6m/dictobject.h" "$(@D)/python_include/dictobject.h" && cp "/opt/python3.6/include/python3.6m/dtoa.h" "$(@D)/python_include/dtoa.h" && cp "/opt/python3.6/include/python3.6m/dynamic_annotations.h" "$(@D)/python_include/dynamic_annotations.h" && cp "/opt/python3.6/include/python3.6m/enumobject.h" "$(@D)/python_include/enumobject.h" && cp "/opt/python3.6/include/python3.6m/errcode.h" "$(@D)/python_include/errcode.h" && cp "/opt/python3.6/include/python3.6m/eval.h" "$(@D)/python_include/eval.h" && cp "/opt/python3.6/include/python3.6m/fileobject.h" "$(@D)/python_include/fileobject.h" && cp "/opt/python3.6/include/python3.6m/fileutils.h" "$(@D)/python_include/fileutils.h" && cp "/opt/python3.6/include/python3.6m/floatobject.h" "$(@D)/python_include/floatobject.h" && cp "/opt/python3.6/include/python3.6m/frameobject.h" "$(@D)/python_include/frameobject.h" && cp "/opt/python3.6/include/python3.6m/funcobject.h" "$(@D)/python_include/funcobject.h" && cp "/opt/python3.6/include/python3.6m/genobject.h" "$(@D)/python_include/genobject.h" && cp "/opt/python3.6/include/python3.6m/graminit.h" "$(@D)/python_include/graminit.h" && cp "/opt/python3.6/include/python3.6m/grammar.h" "$(@D)/python_include/grammar.h" && cp "/opt/python3.6/include/python3.6m/import.h" "$(@D)/python_include/import.h" && cp "/opt/python3.6/include/python3.6m/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp "/opt/python3.6/include/python3.6m/iterobject.h" "$(@D)/python_include/iterobject.h" && cp "/opt/python3.6/include/python3.6m/listobject.h" "$(@D)/python_include/listobject.h" && cp "/opt/python3.6/include/python3.6m/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp "/opt/python3.6/include/python3.6m/longobject.h" "$(@D)/python_include/longobject.h" && cp "/opt/python3.6/include/python3.6m/marshal.h" "$(@D)/python_include/marshal.h" && cp "/opt/python3.6/include/python3.6m/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp "/opt/python3.6/include/python3.6m/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp "/opt/python3.6/include/python3.6m/methodobject.h" "$(@D)/python_include/methodobject.h" && cp "/opt/python3.6/include/python3.6m/modsupport.h" "$(@D)/python_include/modsupport.h" && cp "/opt/python3.6/include/python3.6m/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp "/opt/python3.6/include/python3.6m/namespaceobject.h" "$(@D)/python_include/namespaceobject.h" && cp "/opt/python3.6/include/python3.6m/node.h" "$(@D)/python_include/node.h" && cp "/opt/python3.6/include/python3.6m/object.h" "$(@D)/python_include/object.h" && cp "/opt/python3.6/include/python3.6m/objimpl.h" "$(@D)/python_include/objimpl.h" && cp "/opt/python3.6/include/python3.6m/odictobject.h" "$(@D)/python_include/odictobject.h" && cp "/opt/python3.6/include/python3.6m/opcode.h" "$(@D)/python_include/opcode.h" && cp "/opt/python3.6/include/python3.6m/osdefs.h" "$(@D)/python_include/osdefs.h" && cp "/opt/python3.6/include/python3.6m/osmodule.h" "$(@D)/python_include/osmodule.h" && cp "/opt/python3.6/include/python3.6m/parsetok.h" "$(@D)/python_include/parsetok.h" && cp "/opt/python3.6/include/python3.6m/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp "/opt/python3.6/include/python3.6m/pgen.h" "$(@D)/python_include/pgen.h" && cp "/opt/python3.6/include/python3.6m/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp "/opt/python3.6/include/python3.6m/py_curses.h" "$(@D)/python_include/py_curses.h" && cp "/opt/python3.6/include/python3.6m/pyarena.h" "$(@D)/python_include/pyarena.h" && cp "/opt/python3.6/include/python3.6m/pyatomic.h" "$(@D)/python_include/pyatomic.h" && cp "/opt/python3.6/include/python3.6m/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp "/opt/python3.6/include/python3.6m/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp "/opt/python3.6/include/python3.6m/pyctype.h" "$(@D)/python_include/pyctype.h" && cp "/opt/python3.6/include/python3.6m/pydebug.h" "$(@D)/python_include/pydebug.h" && cp "/opt/python3.6/include/python3.6m/pydtrace.h" "$(@D)/python_include/pydtrace.h" && cp "/opt/python3.6/include/python3.6m/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp "/opt/python3.6/include/python3.6m/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp "/opt/python3.6/include/python3.6m/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp "/opt/python3.6/include/python3.6m/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp "/opt/python3.6/include/python3.6m/pyhash.h" "$(@D)/python_include/pyhash.h" && cp "/opt/python3.6/include/python3.6m/pylifecycle.h" "$(@D)/python_include/pylifecycle.h" && cp "/opt/python3.6/include/python3.6m/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp "/opt/python3.6/include/python3.6m/pymacro.h" "$(@D)/python_include/pymacro.h" && cp "/opt/python3.6/include/python3.6m/pymath.h" "$(@D)/python_include/pymath.h" && cp "/opt/python3.6/include/python3.6m/pymem.h" "$(@D)/python_include/pymem.h" && cp "/opt/python3.6/include/python3.6m/pyport.h" "$(@D)/python_include/pyport.h" && cp "/opt/python3.6/include/python3.6m/pystate.h" "$(@D)/python_include/pystate.h" && cp "/opt/python3.6/include/python3.6m/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp "/opt/python3.6/include/python3.6m/pystrhex.h" "$(@D)/python_include/pystrhex.h" && cp "/opt/python3.6/include/python3.6m/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp "/opt/python3.6/include/python3.6m/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp "/opt/python3.6/include/python3.6m/pythread.h" "$(@D)/python_include/pythread.h" && cp "/opt/python3.6/include/python3.6m/pytime.h" "$(@D)/python_include/pytime.h" && cp "/opt/python3.6/include/python3.6m/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp "/opt/python3.6/include/python3.6m/setobject.h" "$(@D)/python_include/setobject.h" && cp "/opt/python3.6/include/python3.6m/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp "/opt/python3.6/include/python3.6m/structmember.h" "$(@D)/python_include/structmember.h" && cp "/opt/python3.6/include/python3.6m/structseq.h" "$(@D)/python_include/structseq.h" && cp "/opt/python3.6/include/python3.6m/symtable.h" "$(@D)/python_include/symtable.h" && cp "/opt/python3.6/include/python3.6m/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp "/opt/python3.6/include/python3.6m/token.h" "$(@D)/python_include/token.h" && cp "/opt/python3.6/include/python3.6m/traceback.h" "$(@D)/python_include/traceback.h" && cp "/opt/python3.6/include/python3.6m/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp "/opt/python3.6/include/python3.6m/typeslots.h" "$(@D)/python_include/typeslots.h" && cp "/opt/python3.6/include/python3.6m/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp "/opt/python3.6/include/python3.6m/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp "/opt/python3.6/include/python3.6m/warnings.h" "$(@D)/python_include/warnings.h" && cp "/opt/python3.6/include/python3.6m/weakrefobject.h" "$(@D)/python_include/weakrefobject.h"
    """,
 )
 
 genrule(
     name = "numpy_include",
     outs = [
-        "numpy_include/numpy/oldnumeric.h",
-        "numpy_include/numpy/npy_1_7_deprecated_api.h",
-        "numpy_include/numpy/ufunc_api.txt",
-        "numpy_include/numpy/multiarray_api.txt",
-        "numpy_include/numpy/halffloat.h",
-        "numpy_include/numpy/npy_common.h",
-        "numpy_include/numpy/utils.h",
-        "numpy_include/numpy/npy_interrupt.h",
-        "numpy_include/numpy/npy_endian.h",
+        "numpy_include/numpy/__multiarray_api.h",
         "numpy_include/numpy/__ufunc_api.h",
         "numpy_include/numpy/_neighborhood_iterator_imp.h",
-        "numpy_include/numpy/ufuncobject.h",
+        "numpy_include/numpy/_numpyconfig.h",
+        "numpy_include/numpy/arrayobject.h",
+        "numpy_include/numpy/arrayscalars.h",
+        "numpy_include/numpy/halffloat.h",
+        "numpy_include/numpy/multiarray_api.txt",
+        "numpy_include/numpy/ndarrayobject.h",
         "numpy_include/numpy/ndarraytypes.h",
-        "numpy_include/numpy/npy_math.h",
         "numpy_include/numpy/noprefix.h",
+        "numpy_include/numpy/npy_1_7_deprecated_api.h",
         "numpy_include/numpy/npy_3kcompat.h",
-        "numpy_include/numpy/arrayscalars.h",
-        "numpy_include/numpy/npy_os.h",
-        "numpy_include/numpy/ndarrayobject.h",
-        "numpy_include/numpy/npy_no_deprecated_api.h",
-        "numpy_include/numpy/arrayobject.h",
-        "numpy_include/numpy/_numpyconfig.h",
-        "numpy_include/numpy/__multiarray_api.h",
+        "numpy_include/numpy/npy_common.h",
         "numpy_include/numpy/npy_cpu.h",
-        "numpy_include/numpy/old_defines.h",
+        "numpy_include/numpy/npy_endian.h",
+        "numpy_include/numpy/npy_interrupt.h",
+        "numpy_include/numpy/npy_math.h",
+        "numpy_include/numpy/npy_no_deprecated_api.h",
+        "numpy_include/numpy/npy_os.h",
         "numpy_include/numpy/numpyconfig.h",
+        "numpy_include/numpy/old_defines.h",
+        "numpy_include/numpy/oldnumeric.h",
+        "numpy_include/numpy/ufunc_api.txt",
+        "numpy_include/numpy/ufuncobject.h",
+        "numpy_include/numpy/utils.h",
     ],
     cmd = """
-cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h"
+cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp "/opt/python3.6/lib/python3.6/site-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h"
    """,
 )